In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import load_data, save_data
from utilities import temporal_train_test_split
from utilities import print_title, print_label

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Model evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
def split_dataset_by_date(raw_data: pd.DataFrame, todays_date: str):
    # Filter data by today's date
    filter_data_by_date = raw_data["Date"] == todays_date
    
    # Create a new dataframe with today's data
    todays_data = raw_data[filter_data_by_date].reset_index(drop=True)
    
    # Create a new dataframe with historical data (excluding today's data)
    historical_data = raw_data[~filter_data_by_date].reset_index(drop=True)
    
    return historical_data, todays_data

In [4]:
def filter_data_by_date_range(data: pd.DataFrame, end_date: str):
    # Determine the start date as the minimum date in the dataframe
    start_date = data["Date"].min()
    
    # Create a date range tuple
    date_range = (start_date, end_date)
    
    # Create a filter to exclude data within the specified date range
    remove_data_by_date = (data["Date"] >= date_range[0]) & (data["Date"] < date_range[1])
    
    # Filter the dataframe using the created filter
    filtered_data = data[~remove_data_by_date].reset_index(drop=True)
    
    return filtered_data

In [5]:
file_name = "sp500_adj_close_raw_with_nas"
file_path = f"../../../data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `test_w_na.csv.bz2` loaded from `sp500_adj_close_raw_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Data Pre-Processing
___

#### Split todays data (For prediction) and historical data (For training)

In [6]:
todays_date = "2024-10-25"

historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

print("Todays Date:", todays_date)

Todays Date: 2024-10-25


#### Ensure Missing Values on Todays Data is what we are Predicting

In [7]:
todays_data.isnull().sum()

Date                    0
Ticker                  0
Adjusted Close          0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle Missing Values (NA's)


In [8]:
historical_data.isnull().sum()

Date                       0
Ticker                     0
Adjusted Close             0
Next Day Close             0
Previous Day Close         0
Return                140634
Volatility            142254
RSI                   140913
SMA_50                     0
SMA_100                    0
SMA_200                    0
Upper Band                 0
Lower Band                 0
Support                    0
Resistance                 0
Action                141539
dtype: int64

In [9]:
# Remove all rows where `Adjusted Close` is 0
# Stock did not trade on these days because it did not exist yet
historical_data = historical_data[historical_data["Adjusted Close"] != 0].reset_index(drop=True)

historical_data.isnull().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                  81
Volatility            1701
RSI                    360
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1067
dtype: int64

In [10]:
# Remove all rows where `Volatility` is NaN
# This is because we don't have the necessary data to calculate volatility at these points
# We will not be able to use these rows for training

historical_data = historical_data.dropna(subset=["Volatility"]).reset_index(drop=True)

historical_data.isnull().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                   0
Volatility               0
RSI                    358
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1050
dtype: int64

In [11]:
# Backfill the `RSI` column
# This is because the RSI is calculated based on the previous day's data
historical_data["RSI"] = historical_data["RSI"].bfill()

historical_data.isnull().sum()

Date                     0
Ticker                   0
Adjusted Close           0
Next Day Close           0
Previous Day Close       0
Return                   0
Volatility               0
RSI                      0
SMA_50                   0
SMA_100                  0
SMA_200                  0
Upper Band               0
Lower Band               0
Support                  0
Resistance               0
Action                1050
dtype: int64

In [12]:
# Backfill the `Action` column (Trading Signals)
# This is because the trading signal is based on the previous day's data

historical_data["Action"] = historical_data["Action"].bfill()

historical_data.isnull().sum()

Date                  0
Ticker                0
Adjusted Close        0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [13]:
# Print tickers that do not have any missing values
print_title("Tickers that do not have any missing values", closed_corners=False)

num_tickers = len(historical_data["Ticker"].unique())
print_label("Number of unique tickers:", num_tickers)

missing_val_filter = historical_data.isnull().any(axis=1)

# Group by ticker and check for missing values within each group
grouped = historical_data.groupby("Ticker")

# Tickers with no missing values
tickers_no_missing_values = grouped.filter(lambda x: not x.isnull().any().any())["Ticker"].unique()
print_label("Number of tickers with no missing values:", len(tickers_no_missing_values))

# Tickers with missing values
tickers_missing_values = grouped.filter(lambda x: x.isnull().any().any())["Ticker"].unique()
print_label("Number of tickers with missing values:", len(tickers_missing_values), closed_corners=True)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


In [14]:
import numpy as np

# create numpy arraw with missing values and ticker
tickers_missing_val_count = np.array([(ticker, value) for ticker, value in historical_data[missing_val_filter]["Ticker"].value_counts().items()])

tickers_missing_val_count

array([], dtype=float64)

In [15]:
tickers_no_missing_values

array(['A', 'AAPL', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP',
       'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM',
       'ALB', 'ALGN', 'ALL', 'AMAT', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT',
       'AMZN', 'ANSS', 'AON', 'AOS', 'APA', 'APD', 'APH', 'ARE', 'ATO',
       'AVB', 'AVY', 'AXON', 'AXP', 'AZO', 'BA', 'BAC', 'BALL', 'BAX',
       'BBY', 'BDX', 'BEN', 'BG', 'BIIB', 'BK', 'BKNG', 'BKR', 'BLDR',
       'BLK', 'BMY', 'BR', 'BRO', 'BSX', 'BWA', 'BX', 'BXP', 'C', 'CAG',
       'CAH', 'CAT', 'CB', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CE', 'CF',
       'CHD', 'CHRW', 'CI', 'CINF', 'CL', 'CLX', 'CMCSA', 'CME', 'CMG',
       'CMI', 'CMS', 'CNC', 'CNP', 'COF', 'COO', 'COP', 'COR', 'COST',
       'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CSCO', 'CSGP', 'CSX', 'CTAS',
       'CTRA', 'CTSH', 'CVS', 'CVX', 'D', 'DAL', 'DD', 'DE', 'DECK',
       'DFS', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOC', 'DOV',
       'DPZ', 'DRI', 'DTE', 'DUK', 'DVA', 'DVN', 'DXCM', 'EA', 'EBAY',


### Exploratory Data Analysis (EDA):
___

#### Todays Data:

In [16]:
print("Shape:", todays_data.shape)
display(todays_data.head(2))
display(todays_data.tail(2))

Shape: (501, 16)


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2024-10-25,A,130.16,,130.69,-0.004055,0.012137,18.866226,140.33243,136.65535,137.94844,152.15059,129.75082,130.16,148.244,
1,2024-10-25,AAPL,231.4,,230.57,0.0036,0.013669,66.374374,226.8014,221.79668,200.76884,237.80908,222.31091,216.32,236.48,


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
499,2024-10-25,ZBRA,359.97,,362.05,-0.005745,0.009788,43.769238,355.8908,336.8905,311.04135,380.01144,359.82355,320.77,377.68,
500,2024-10-25,ZTS,179.91,,181.5,-0.00876,0.012576,36.496883,189.094,183.3149,179.2437,197.88783,182.27017,179.91,196.48,


#### Historical Data:

In [17]:
print("Shape:", historical_data.shape)
display(historical_data.head(2))
display(historical_data.tail(2))

Shape: (1978479, 16)


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256388,23.025743,23.53828,-0.011976,0.015704,48.82759,23.314175,23.299887,23.564934,24.727251,22.540236,21.392029,24.351946,short
1,2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.06735,5.518483,4.939064,4.19763,6.135833,5.403559,4.637376,6.02684,buy


Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1978477,2024-10-24,ZBRA,362.05,359.97,368.09,-0.016409,0.010532,44.39913,355.5556,336.3185,310.51785,379.3288,361.3692,320.77,377.68,short
1978478,2024-10-24,ZTS,181.5,179.91,188.99,-0.039632,0.013328,35.00878,189.1774,183.23051,179.33058,197.27248,184.37552,180.9,196.48,short


#### Optional: For initial phase of training, filter large dataset.

In [18]:
# remove_up_to = "2024-01-01"
# historical_data = filter_data_by_date_range(historical_data, remove_up_to)

# print("Shape:", historical_data.shape)
# display(historical_data.head())
# display(historical_data.tail())

#### Inspect Tickers Individually

In [19]:
# Inspect Cleaned data zeroing in on individual tickers
select_df = raw_data.copy()

select_ticker = select_df["Ticker"] == "AAPL"

select_df[select_ticker]

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1,2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.067350,5.518483,4.939064,4.197630,6.135833,5.403559,4.637376,6.02684,buy
502,2008-01-03,AAPL,5.879055,5.430276,5.876342,0.000462,0.018287,56.259520,5.530891,4.960155,4.213231,6.122779,5.462215,4.637376,6.02684,sell
1003,2008-01-04,AAPL,5.430276,5.357593,5.879055,-0.076335,0.025363,37.311500,5.527205,4.975917,4.226226,6.142305,5.426252,4.637376,6.02684,short
1504,2008-01-07,AAPL,5.357593,5.164871,5.430276,-0.013385,0.024475,36.547832,5.522205,4.992085,4.238845,6.171195,5.360235,4.637376,6.02684,short
2005,2008-01-08,AAPL,5.164871,5.410674,5.357593,-0.035972,0.024816,36.241930,5.515250,5.007573,4.250567,6.214129,5.247783,4.637376,6.02684,buy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2118730,2024-10-21,AAPL,236.480000,235.860000,235.000000,0.006298,0.012880,66.655846,225.934400,220.221050,199.802290,236.515660,221.650340,216.320000,236.48000,sell
2119231,2024-10-22,AAPL,235.860000,230.760000,236.480000,-0.002622,0.012744,64.702080,226.301000,220.668960,200.079120,237.478500,221.536500,216.320000,236.48000,short
2119732,2024-10-23,AAPL,230.760000,230.570000,235.860000,-0.021623,0.013733,57.298534,226.490800,221.056290,200.308620,237.575040,221.878950,216.320000,236.48000,short
2120233,2024-10-24,AAPL,230.570000,231.400000,230.760000,-0.000823,0.013688,55.555565,226.667800,221.423930,200.539280,237.665250,222.093750,216.320000,236.48000,buy


### Create Multiple Versions of Dataset
___

In [20]:
main_data = historical_data.copy().reset_index(drop=True)

# Create multiple versions of the dataset

# Data with out dates and tickers (Set as index for reference)
data_v1 = main_data.copy().set_index(["Date", "Ticker"])

# Data with dates seperated into year, month, and day
data_v2 = main_data.copy()
data_v2["Date"] = pd.to_datetime(data_v2["Date"])
data_v2["Year"] = data_v2["Date"].dt.year
data_v2["Month"] = data_v2["Date"].dt.month
data_v2["Day"] = data_v2["Date"].dt.day
data_v2 = data_v2.set_index(["Date", "Ticker"])

In [21]:
data_v2

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2008-01-02,A,23.256388,23.025743,23.538280,-0.011976,0.015704,48.827590,23.314175,23.299887,23.564934,24.727251,22.540236,21.392029,24.351946,short,2008,1,2
2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.067350,5.518483,4.939064,4.197630,6.135833,5.403559,4.637376,6.026840,buy,2008,1,2
2008-01-02,ABT,18.130209,18.019754,18.240662,-0.006055,0.010484,34.677372,18.138460,17.628250,17.709028,19.233107,18.221804,16.775562,19.134018,short,2008,1,2
2008-01-02,ACGL,7.608889,7.764444,7.816667,-0.026581,0.016022,45.154190,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy,2008,1,2
2008-01-02,ACN,26.437077,25.982521,26.415081,0.000833,0.024039,54.812176,26.577982,27.784420,28.471031,28.227203,24.273775,24.765512,29.215677,sell,2008,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-24,XYL,130.650000,130.410000,131.720000,-0.008123,0.009364,41.637012,133.559020,134.482200,130.753770,138.216640,130.534350,126.710000,137.530000,short,2024,10,24
2024-10-24,YUM,133.160000,133.040000,134.020000,-0.006417,0.010962,32.368100,134.692780,133.522200,133.979570,140.235460,130.624540,129.710000,139.920000,short,2024,10,24
2024-10-24,ZBH,104.000000,102.330000,104.700000,-0.006686,0.010517,47.462685,107.512610,108.075780,115.475070,108.185260,101.411740,101.770000,115.912370,short,2024,10,24
2024-10-24,ZBRA,362.050000,359.970000,368.090000,-0.016409,0.010532,44.399130,355.555600,336.318500,310.517850,379.328800,361.369200,320.770000,377.680000,short,2024,10,24


#### Select which version of the data to work with

In [22]:
select_data = data_v2.copy()

select_columns_to_drop = ["Action", "Previous Day Close", "SMA_50", "Resistance", "Upper Band", "SMA_200"]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1978479, 11)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2008-01-02,A,23.256388,23.025743,-0.011976,0.015704,48.82759,23.299887,22.540236,21.392029,2008,1,2
2008-01-02,AAPL,5.876342,5.879055,-0.016357,0.018937,59.06735,4.939064,5.403559,4.637376,2008,1,2
2008-01-02,ABT,18.130209,18.019754,-0.006055,0.010484,34.677372,17.62825,18.221804,16.775562,2008,1,2
2008-01-02,ACGL,7.608889,7.764444,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1,2
2008-01-02,ACN,26.437077,25.982521,0.000833,0.024039,54.812176,27.78442,24.273775,24.765512,2008,1,2


### Split data features `X` and target `y`
___

In [23]:
# Split the data into features (X) and target (y)
X = data.drop(columns="Next Day Close")

y = data["Next Day Close"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1978479, 10)
Shape of y: (1978479,)


### Feature Engineering
___

#### Inspect Multicollinearity using VIF

In [24]:
# Perform correlation matrix of X
X.corr().style.background_gradient(cmap="coolwarm")

Unnamed: 0,Adjusted Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Adjusted Close,1.0,0.004575,-0.049952,0.02353,0.995365,0.998308,0.996711,0.259818,0.004643,5.4e-05
Return,0.004575,1.0,0.026863,0.21493,-0.00179,-0.002142,-0.001604,0.001578,0.002862,-0.000203
Volatility,-0.049952,0.026863,1.0,-0.095813,-0.039082,-0.059477,-0.057813,-0.13781,0.008778,-0.003205
RSI,0.02353,0.21493,-0.095813,1.0,0.000401,0.007452,0.004387,0.011158,-0.00254,0.003985
SMA_100,0.995365,-0.00179,-0.039082,0.000401,1.0,0.995424,0.997062,0.262989,0.005244,0.000356
Lower Band,0.998308,-0.002142,-0.059477,0.007452,0.995424,1.0,0.997957,0.259315,0.004509,8.2e-05
Support,0.996711,-0.001604,-0.057813,0.004387,0.997062,0.997957,1.0,0.260094,0.004889,0.000432
Year,0.259818,0.001578,-0.13781,0.011158,0.262989,0.259315,0.260094,1.0,-0.028679,-0.001871
Month,0.004643,0.002862,0.008778,-0.00254,0.005244,0.004509,0.004889,-0.028679,1.0,-0.001247
Day,5.4e-05,-0.000203,-0.003205,0.003985,0.000356,8.2e-05,0.000432,-0.001871,-0.001247,1.0


##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

In [25]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Set the threshold for VIF (Based on the domain knowledge of the data)
THRESHOLD = 2000

def highlight_vif(row):
    return ["background-color: black" if value < THRESHOLD else "" for value in row]

def calc_vif(X):
    # Calculate VIF values
    vif_values = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    # Create a dataframe to display the VIF values
    vif = pd.DataFrame(
        data={"VIF": vif_values},
        index=X.columns
    )

    return vif

vif_df = calc_vif(X).sort_values("VIF")
vif_df.style.apply(highlight_vif)

Unnamed: 0,VIF
Return,1.055047
Volatility,3.424442
Day,4.226557
Month,4.641133
RSI,13.036789
Year,23.55678
SMA_100,236.750592
Adjusted Close,438.247347
Support,467.769801
Lower Band,620.094715


### Data Splitting
___

In [26]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1582783, 10)
Shape of X_test: (395696, 10)
Shape of y_train: (1582783,)
Shape of y_test: (395696,)


#### Inspect Probability Values `(p-values)`|

In [27]:
# Check P-Values
import statsmodels.api as sm

ols_model = sm.OLS(
    y_train,
    X_train
).fit()

ols_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2ab212afeb0>

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [28]:
def highlight_p_values(row):
    return ["background-color: black" if value <= 0.05 else "" for value in row]

p_values_df = ols_model.pvalues.sort_values().to_frame(name="p_value")

p_values_df.style.apply(highlight_p_values)

Unnamed: 0,p_value
Adjusted Close,0.0
SMA_100,0.0
Lower Band,0.0
Support,0.0
Year,0.0
RSI,0.0
Return,0.0
Volatility,3e-06
Month,0.000131
Day,0.482955


### Model Training
___


#### Scale the data using `StandardScaler`

In [29]:
# Scale using StandardScaler
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1582783, 10)
X_test_scaled shape: (395696, 10)
y_train_scaled shape: (1582783, 1)
y_test_scaled shape: (395696, 1)


In [30]:
def adj_r2_score(model, X, y):
    r2 = model.score(X, y)
    n_cols = X.shape[1]
    return 1 - (1 - r2) * (len(y) -1) / (len(y) - n_cols - 1)

In [31]:
lin_reg = LinearRegression(
    n_jobs=-1 # Use all processors
)

lin_reg.fit(X_train_scaled, y_train_scaled)


In [32]:
def adj_r2_score(model, X, y):
    r2 = model.score(X, y)
    n_cols = X.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [33]:
y_predict_scaled = lin_reg.predict(X_test_scaled)

y_predict_unscaled = y_scaler.inverse_transform(y_predict_scaled)
y_test_unscaled = y_scaler.inverse_transform(y_test_scaled)

mse = mean_squared_error(y_test_unscaled, y_predict_unscaled)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_unscaled, y_predict_unscaled)
adj_r2 = adj_r2_score(lin_reg, X_test_scaled, y_test_scaled)

print_title("Linear Regression Model Evaluation", closed_corners=False)
print_label("Mean Squared Error:", mse)
print_label("Root Mean Squared Error:", rmse)
print_label("R-Squared:", r2)
print_label("Adjusted R-Squared:", adj_r2, closed_corners=True) 

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m              Linear Regression Model Evaluation               [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mMean Squared Error:            |           15.53391          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mRoot Mean Squared Error:       |           3.94131           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mR-Squared:                     |           0.99965           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mAdjusted R-Squared:            |           0.99965           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


In [34]:
y_test_unscaled

array([[ 79.33535 ],
       [ 76.643776],
       [215.37453 ],
       ...,
       [ 65.32262 ],
       [ 32.881668],
       [ 84.435646]])

In [35]:
cv_scores = cross_val_score(
    LinearRegression(
        n_jobs=-1
    ),
    X_train_scaled,
    y_train_scaled,
    scoring="r2",
    cv=10
)

print_title("Cross Validation Scores", closed_corners=False)

for index, score in enumerate(cv_scores):
    print_label(f"Fold {index+1}:", score)

print_label("", "")
print_label("Mean R^2 Score:", cv_scores.mean())
print_label("Standard Deviation:", cv_scores.std(), closed_corners=True)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m                    Cross Validation Scores                    [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 1:                        |           0.99961           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 2:                        |           0.99960           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 3:                        |           0.99965           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 4:                        |           0.99959           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 5:                        |           0.99951           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mFold 6:                        |           0.99961           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

### Predict tomorrows `Adjusted Close`
___

In [36]:
def preprocess_todays_data(df, columns_to_drop):
    # Convert 'Date' column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from 'Date' column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Drop specified columns
    df = df.drop(columns=columns_to_drop)
    
    # Set 'Date' and 'Ticker' as the index
    df = df.set_index(["Date", "Ticker"])
    
    # Drop 'Next Day Close' column
    df = df.drop(columns="Next Day Close")
    
    return df

X_to_predict = todays_data.copy()

X_to_predict = preprocess_todays_data(X_to_predict, select_columns_to_drop)

X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of todays_data_clean_scaled:", X_to_predict_scaled.shape)
X_to_predict_scaled[:5]

Shape of todays_data_clean_scaled: (501, 10)


array([[ 0.21029221, -0.21050541, -0.50676969, -2.04849203,  0.26539134,
         0.25211298,  0.28301857,  1.62443221,  1.02782802,  1.06190002],
       [ 0.71656911,  0.1299215 , -0.38651463,  0.79365625,  0.70975971,
         0.74646253,  0.76314191,  1.62443221,  1.02782802,  1.06190002],
       [ 0.49863601, -0.4592691 , -0.79119435, -0.85020721,  0.50742677,
         0.5520643 ,  0.59719398,  1.62443221,  1.02782802,  1.06190002],
       [ 0.23249558,  0.58958269, -0.28334676,  0.37484342,  0.24850492,
         0.22527628,  0.19452787,  1.62443221,  1.02782802,  1.06190002],
       [ 0.13068012, -0.90781959, -0.64725082,  0.08741691,  0.12313012,
         0.15204863,  0.17207307,  1.62443221,  1.02782802,  1.06190002]])

In [37]:
y_to_predict_scaled = lin_reg.predict(X_to_predict_scaled)

y_to_predict_unscaled = y_scaler.inverse_transform(y_to_predict_scaled)

print("Shape of y_to_predict_unscaled:", y_to_predict_unscaled.shape)
y_to_predict_unscaled[:5]

Shape of y_to_predict_unscaled: (501, 1)


array([[130.28101605],
       [231.58869119],
       [187.95440641],
       [134.72715696],
       [114.33049032]])

In [48]:
prediction_df = todays_data.copy()

prediction_df["Next Day Close"] = y_to_predict_unscaled.round(3)

prediction_df = prediction_df[["Date", "Ticker", "Adjusted Close", "Next Day Close"]]

prediction_df = prediction_df.set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head(20))
display(prediction_df.tail(20))

Shape: (501, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,A,130.16,130.281
2024-10-25,AAPL,231.4,231.589
2024-10-25,ABBV,187.82,187.954
2024-10-25,ABNB,134.6,134.727
2024-10-25,ABT,114.24,114.33
2024-10-25,ACGL,105.28,105.396
2024-10-25,ACN,360.79,361.005
2024-10-25,ADBE,483.73,484.73
2024-10-25,ADI,230.17,230.415
2024-10-25,ADM,56.57,56.591


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,WBD,7.59,7.549
2024-10-25,WDC,69.455,69.447
2024-10-25,WEC,97.17,97.225
2024-10-25,WELL,130.19,130.241
2024-10-25,WFC,64.52,64.537
2024-10-25,WM,206.8,207.08
2024-10-25,WMB,52.51,52.497
2024-10-25,WMT,82.51,82.56
2024-10-25,WRB,58.17,58.224
2024-10-25,WST,307.92,308.129


In [49]:
# Make index column a regular column
prediction_df.reset_index(inplace=True)

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close
0,2024-10-25,A,130.16,130.281
1,2024-10-25,AAPL,231.40,231.589
2,2024-10-25,ABBV,187.82,187.954
3,2024-10-25,ABNB,134.60,134.727
4,2024-10-25,ABT,114.24,114.330
...,...,...,...,...
496,2024-10-25,XYL,130.41,130.573
497,2024-10-25,YUM,133.04,133.129
498,2024-10-25,ZBH,102.33,102.496
499,2024-10-25,ZBRA,359.97,360.426


In [50]:
file_name = "lin_reg_predict.zip"
file_path = f"../../../data/raw_data/{file_name}"

save_data(prediction_df, file_path)

[1m[35m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[35m║[0m[1m[95m File `lin_reg_predict.zip` already exists. Overwriting file.  [0m[1m[35m║[0m
[1m[35m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[32m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[32m║[0m[1m[92m        File saved and zipped as `lin_reg_predict.zip`         [0m[1m[32m║[0m
[1m[32m╚═══════════════════════════════════════════════════════════════╝[0m
