In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import split_dataset_by_date, clean_historical_data, check_tickers_for_missing_values
from utilities import calc_vif, calc_p_values, calc_correlation, highlight_vif, highlight_p_values, evaluate_regression_model, evaluate_cross_validation
from utilities import load_data, save_data

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression

In [3]:
# Name of the raw S&P 500 adjusted-close dataset, passed to `load_data` without an extension
file_name = "sp500_adj_close_raw_with_nas"
# Build the path from the shared project root (defined with the sys.path setup)
# so the root location is declared in exactly one place.
file_path = f"{relative_path_to_root}data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `test_w_na.csv.bz2` loaded from `sp500_adj_close_raw_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Data Pre-Processing
___

#### Split today's data (for prediction) and historical data (for training)

In [4]:
# Cut-off date handed to `split_dataset_by_date`; presumably rows on this date
# become `todays_data` and earlier rows become `historical_data` — verify in utilities.
todays_date = "2024-10-25"

historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

print("Todays Date:", todays_date)

Todays Date: 2024-10-25


#### Ensure the Only Missing Values in Today's Data Are the Ones We Are Predicting

In [5]:
todays_data.isnull().sum()

Date                    0
Ticker                  0
Adjusted Close          0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle Missing Values (NA's)


In [6]:
# NOTE(review): this rebinds `historical_data` to its cleaned version, so re-running
# the cell feeds already-cleaned data back into `clean_historical_data`.
historical_data = clean_historical_data(historical_data)

# Per-column count of missing values remaining after cleaning
historical_data.isnull().sum()

Date                  0
Ticker                0
Adjusted Close        0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [7]:
tickers_no_missing_values, tickers_with_missing_values = check_tickers_for_missing_values(historical_data)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

#### Today's Data:

In [8]:
print("Shape:", todays_data.shape)

Shape: (501, 16)


#### Historical Data:

In [9]:
print("Shape:", historical_data.shape)

Shape: (1978479, 16)


#### Inspect Tickers Individually

In [10]:
# Inspect the rows belonging to a single ticker.
# NOTE(review): this filters `raw_data`, not the cleaned `historical_data` —
# confirm which frame was meant to be inspected here.
select_df = raw_data.copy()

# Boolean mask selecting the AAPL rows
select_ticker = select_df["Ticker"] == "AAPL"

select_df[select_ticker]

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1,2008-01-02,AAPL,5.876342,5.879055,5.974057,-0.016357,0.018937,59.067350,5.518483,4.939064,4.197630,6.135833,5.403559,4.637376,6.02684,buy
502,2008-01-03,AAPL,5.879055,5.430276,5.876342,0.000462,0.018287,56.259520,5.530891,4.960155,4.213231,6.122779,5.462215,4.637376,6.02684,sell
1003,2008-01-04,AAPL,5.430276,5.357593,5.879055,-0.076335,0.025363,37.311500,5.527205,4.975917,4.226226,6.142305,5.426252,4.637376,6.02684,short
1504,2008-01-07,AAPL,5.357593,5.164871,5.430276,-0.013385,0.024475,36.547832,5.522205,4.992085,4.238845,6.171195,5.360235,4.637376,6.02684,short
2005,2008-01-08,AAPL,5.164871,5.410674,5.357593,-0.035972,0.024816,36.241930,5.515250,5.007573,4.250567,6.214129,5.247783,4.637376,6.02684,buy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2118730,2024-10-21,AAPL,236.480000,235.860000,235.000000,0.006298,0.012880,66.655846,225.934400,220.221050,199.802290,236.515660,221.650340,216.320000,236.48000,sell
2119231,2024-10-22,AAPL,235.860000,230.760000,236.480000,-0.002622,0.012744,64.702080,226.301000,220.668960,200.079120,237.478500,221.536500,216.320000,236.48000,short
2119732,2024-10-23,AAPL,230.760000,230.570000,235.860000,-0.021623,0.013733,57.298534,226.490800,221.056290,200.308620,237.575040,221.878950,216.320000,236.48000,short
2120233,2024-10-24,AAPL,230.570000,231.400000,230.760000,-0.000823,0.013688,55.555565,226.667800,221.423930,200.539280,237.665250,222.093750,216.320000,236.48000,buy


### Create Multiple Versions of Dataset
___

In [11]:
# Expand `Date` into Year/Month/Day feature columns and move `Date` and `Ticker` into the index (kept for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    """Build a frame indexed by ``(Date, Ticker)`` with calendar feature columns.

    The input is never modified: all work happens on a copy whose row index
    has been reset to a fresh ``RangeIndex``.

    Parameters
    ----------
    main_data : pd.DataFrame
        Frame containing at least a ``Date`` column (anything accepted by
        ``pd.to_datetime``) and a ``Ticker`` column.

    Returns
    -------
    pd.DataFrame
        Copy of ``main_data`` with ``Date`` parsed to datetime, integer
        ``Year``, ``Month`` and ``Day`` columns appended (in that order), and
        ``Date``/``Ticker`` moved into a two-level index.
    """
    return (
        main_data.copy()
        .reset_index(drop=True)
        # Parse the dates first so the `.dt` accessor is available below
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
        # Calendar components derived from the parsed dates
        .assign(
            Year=lambda frame: frame["Date"].dt.year,
            Month=lambda frame: frame["Date"].dt.month,
            Day=lambda frame: frame["Date"].dt.day,
        )
        # Keep the identifying keys as the index rather than as feature columns
        .set_index(["Date", "Ticker"])
    )

# `prepare_data_v2` already copies its input and resets the index before touching
# it, so the frame is passed directly instead of being duplicated a second time.
main_data = prepare_data_v2(historical_data)

#### Select which version of the data to work with

In [12]:
# Columns excluded from the modelling frame
select_columns_to_drop = [
    "Action",
    "Previous Day Close",
    "SMA_50",
    "Resistance",
    "Upper Band",
    "SMA_200",
]

# Work from a copy so `main_data` stays available unchanged
select_data = main_data.copy()
data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1978479, 11)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2008-01-02,A,23.256388,23.025743,-0.011976,0.015704,48.82759,23.299887,22.540236,21.392029,2008,1,2
2008-01-02,AAPL,5.876342,5.879055,-0.016357,0.018937,59.06735,4.939064,5.403559,4.637376,2008,1,2
2008-01-02,ABT,18.130209,18.019754,-0.006055,0.010484,34.677372,17.62825,18.221804,16.775562,2008,1,2
2008-01-02,ACGL,7.608889,7.764444,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1,2
2008-01-02,ACN,26.437077,25.982521,0.000833,0.024039,54.812176,27.78442,24.273775,24.765512,2008,1,2


### Split data features `X` and target `y`
___

In [13]:
# Target is the `Next Day Close` column; every remaining column is a feature
y = data["Next Day Close"]
X = data.drop(columns=["Next Day Close"])

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1978479, 10)
Shape of y: (1978479,)


### Feature Engineering
___

#### Inspect Multicollinearity using VIF

In [14]:
# Perform correlation matrix of X
calc_correlation(X)

Unnamed: 0,Adjusted Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Adjusted Close,1.0,0.004575,-0.049952,0.02353,0.995365,0.998308,0.996711,0.259818,0.004643,5.4e-05
Return,0.004575,1.0,0.026863,0.21493,-0.00179,-0.002142,-0.001604,0.001578,0.002862,-0.000203
Volatility,-0.049952,0.026863,1.0,-0.095813,-0.039082,-0.059477,-0.057813,-0.13781,0.008778,-0.003205
RSI,0.02353,0.21493,-0.095813,1.0,0.000401,0.007452,0.004387,0.011158,-0.00254,0.003985
SMA_100,0.995365,-0.00179,-0.039082,0.000401,1.0,0.995424,0.997062,0.262989,0.005244,0.000356
Lower Band,0.998308,-0.002142,-0.059477,0.007452,0.995424,1.0,0.997957,0.259315,0.004509,8.2e-05
Support,0.996711,-0.001604,-0.057813,0.004387,0.997062,0.997957,1.0,0.260094,0.004889,0.000432
Year,0.259818,0.001578,-0.13781,0.011158,0.262989,0.259315,0.260094,1.0,-0.028679,-0.001871
Month,0.004643,0.002862,0.008778,-0.00254,0.005244,0.004509,0.004889,-0.028679,1.0,-0.001247
Day,5.4e-05,-0.000203,-0.003205,0.003985,0.000356,8.2e-05,0.000432,-0.001871,-0.001247,1.0


##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

In [15]:
# Perform Variance Inflation Factor (VIF) analysis
vif = calc_vif(X)

# NOTE(review): the highlight threshold is 2000, while the largest VIF in the
# recorded output is about 620 — confirm this threshold is the intended one.
vif.style.apply(lambda x: highlight_vif(x, threshold=2000))

Unnamed: 0,VIF
Return,1.055047
Volatility,3.424442
Day,4.226557
Month,4.641133
RSI,13.036789
Year,23.55678
SMA_100,236.750592
Adjusted Close,438.247347
Support,467.769801
Lower Band,620.094715


### Data Splitting
___

In [16]:
# Split the data into training and testing sets
# NOTE(review): `train_test_split` shuffles rows by default, so observations from
# the same dates end up on both sides of the split. For time-ordered price data a
# chronological hold-out may give a more honest estimate — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1582783, 10)
Shape of X_test: (395696, 10)
Shape of y_train: (1582783,)
Shape of y_test: (395696,)


#### Inspect Probability Values (`p-values`)

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [17]:
# Check P-Values
# NOTE(review): `sm` is not referenced in any visible cell; presumably
# `calc_p_values` uses statsmodels internally — confirm whether this import is
# needed here, and if so move it to the imports cell at the top.
import statsmodels.api as sm

# Returns the per-feature p-value table and the fitted model object
p_values, ols_model = calc_p_values(X_train, y_train)

p_values.style.apply(highlight_p_values)

Unnamed: 0,p_value
Adjusted Close,0.0
SMA_100,0.0
Lower Band,0.0
Support,0.0
Year,0.0
RSI,0.0
Return,0.0
Volatility,3e-06
Month,0.000131
Day,0.482955


### Model Training
___


#### Scale the data using `StandardScaler`

In [18]:
# Separate scalers for features and target so predictions can later be mapped
# back to the original target units via `y_scaler.inverse_transform`
X_scaler, y_scaler = StandardScaler(), StandardScaler()

# Fit on the training split only ...
X_train_scaled = X_scaler.fit_transform(X_train)
y_train_scaled = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))

# ... and reuse the fitted statistics for the test split.
# The target Series is reshaped to a single column because the scaler works on 2-D input.
X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(y_test.to_numpy().reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1582783, 10)
X_test_scaled shape: (395696, 10)
y_train_scaled shape: (1582783, 1)
y_test_scaled shape: (395696, 1)


In [19]:
# Linear regression on the standardised features and target, using all processors
lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(X_train_scaled, y_train_scaled)


In [20]:
# Model output, still in standardised target units, for both splits
y_train_predict_scaled = lin_reg.predict(X_train_scaled)
y_predict_scaled = lin_reg.predict(X_test_scaled)

# Map the training targets and predictions back to the original units
y_train_predict_unscaled = y_scaler.inverse_transform(y_train_predict_scaled)
y_train_unscaled = y_scaler.inverse_transform(y_train_scaled)

# Map the testing targets and predictions back to the original units
y_test_unscaled = y_scaler.inverse_transform(y_test_scaled)
y_predict_unscaled = y_scaler.inverse_transform(y_predict_scaled)

In [21]:
# Print the evaluation report for the fitted model.
# NOTE(review): all ten arguments are positional — keep them in the order that
# `evaluate_regression_model` expects (verify against its signature in utilities).
evaluate_regression_model(
    lin_reg,
    "Linear Regression",
    X_train_scaled,
    y_train_scaled,
    X_test_scaled,
    y_test_scaled,
    y_train_unscaled,
    y_test_unscaled,
    y_train_predict_unscaled,
    y_predict_unscaled
)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[96m              Linear Regression Model Evaluation               [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[33mTraining Data Metrics          |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mMean Squared Error (Train):    |           16.37386          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mRoot Mean Squared Error (Tr... |           4.04646           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mR-Squared (Train):             |           0.99959           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mAdjusted R-Squared (Train):    |           0.99959           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [22]:
# 10-fold cross-validated R² for a fresh LinearRegression on the training split.
# NOTE(review): `X_train_scaled`/`y_train_scaled` were standardised with statistics
# from the whole training split before the folds are formed, so every validation
# fold has influenced its own scaling — wrap scaler + model in a Pipeline if a
# leakage-free estimate is needed.
cv_scores = cross_val_score(
    LinearRegression(
        n_jobs=-1
    ),
    X_train_scaled,
    y_train_scaled,
    scoring="r2",
    cv=10
)

In [23]:
evaluate_cross_validation(cv_scores, "Linear Regression")

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[92m           Linear Regression Cross Validation Scores           [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 1:                        |           0.99961           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 2:                        |           0.99960           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 3:                        |           0.99965           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 4:                        |           0.99959           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 5:                        |           0.99951           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 6:                        |           0.99961           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [24]:
import pickle

# Save model path (built from the shared project root so it is declared once)
model_name = "linear_regression_close.pkl"
model_path = f"{relative_path_to_root}models/{model_name}"

# Save the model to a file
with open(model_path, "wb") as file:
    pickle.dump(lin_reg, file)

print(f"Model saved to {model_path}")

# The regressor was fitted on standardised features and a standardised target, so
# it cannot produce prices in another session without the scalers fitted here.
# Persist them next to the model; the model file itself is left unchanged.
scalers_name = "linear_regression_close_scalers.pkl"
scalers_path = f"{relative_path_to_root}models/{scalers_name}"

with open(scalers_path, "wb") as file:
    pickle.dump({"X_scaler": X_scaler, "y_scaler": y_scaler}, file)

print(f"Scalers saved to {scalers_path}")

Model saved to ../../../models/linear_regression_close.pkl


### Predict tomorrow's `Adjusted Close`
___
___
___

In [25]:
def preprocess_todays_data(df, columns_to_drop):
    """Turn today's rows into a feature frame without modifying the input.

    Works on a copy of ``df`` so the caller's frame keeps its original columns
    and its original ``Date`` values (the previous version wrote the parsed
    dates and the Year/Month/Day columns straight into the argument).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``Date``, ``Ticker`` and ``Next Day Close``
        columns, plus every column named in ``columns_to_drop``.
    columns_to_drop : label or list of labels
        Columns removed before indexing; passed to ``DataFrame.drop``.

    Returns
    -------
    pd.DataFrame
        New frame indexed by ``(Date, Ticker)`` holding the remaining columns
        followed by ``Year``, ``Month`` and ``Day``, without ``Next Day Close``.

    Raises
    ------
    KeyError
        If ``Date``, ``Ticker``, ``Next Day Close`` or any of
        ``columns_to_drop`` is missing from ``df``.
    """
    # Copy first: every assignment below would otherwise leak into the caller's frame
    features = df.copy()

    # Convert 'Date' column to datetime
    features["Date"] = pd.to_datetime(features["Date"])

    # Extract year, month, and day from 'Date' column
    features["Year"] = features["Date"].dt.year
    features["Month"] = features["Date"].dt.month
    features["Day"] = features["Date"].dt.day

    # Drop specified columns
    features = features.drop(columns=columns_to_drop)

    # Set 'Date' and 'Ticker' as the index
    features = features.set_index(["Date", "Ticker"])

    # Drop 'Next Day Close' column (the value being predicted)
    return features.drop(columns="Next Day Close")

# Build today's feature frame from a copy, dropping the same columns as in training
X_to_predict = preprocess_todays_data(todays_data.copy(), select_columns_to_drop)

# Standardise with the scaler fitted on the training features
X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of todays_data_clean_scaled:", X_to_predict_scaled.shape)
X_to_predict_scaled[:5]

Shape of todays_data_clean_scaled: (501, 10)


array([[ 0.21029221, -0.21050541, -0.50676969, -2.04849203,  0.26539134,
         0.25211298,  0.28301857,  1.62443221,  1.02782802,  1.06190002],
       [ 0.71656911,  0.1299215 , -0.38651463,  0.79365625,  0.70975971,
         0.74646253,  0.76314191,  1.62443221,  1.02782802,  1.06190002],
       [ 0.49863601, -0.4592691 , -0.79119435, -0.85020721,  0.50742677,
         0.5520643 ,  0.59719398,  1.62443221,  1.02782802,  1.06190002],
       [ 0.23249558,  0.58958269, -0.28334676,  0.37484342,  0.24850492,
         0.22527628,  0.19452787,  1.62443221,  1.02782802,  1.06190002],
       [ 0.13068012, -0.90781959, -0.64725082,  0.08741691,  0.12313012,
         0.15204863,  0.17207307,  1.62443221,  1.02782802,  1.06190002]])

In [26]:
# Predict in standardised target units for today's rows
y_to_predict_scaled = lin_reg.predict(X_to_predict_scaled)

# Map the predictions back to the original target units
y_to_predict_unscaled = y_scaler.inverse_transform(y_to_predict_scaled)

print("Shape of y_to_predict_unscaled:", y_to_predict_unscaled.shape)
y_to_predict_unscaled[:5]

Shape of y_to_predict_unscaled: (501, 1)


array([[130.28101605],
       [231.58869119],
       [187.95440641],
       [134.72715696],
       [114.33049032]])

In [27]:
# Start from a copy of today's rows and fill the target column with the
# predictions (row order matches the frame the features were built from)
prediction_df = todays_data.copy()
prediction_df["Next Day Close"] = y_to_predict_unscaled.round(3)

# Keep only the identifying keys plus current and predicted close, keyed by (Date, Ticker)
prediction_df = prediction_df.loc[
    :, ["Date", "Ticker", "Adjusted Close", "Next Day Close"]
].set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head(20), prediction_df.tail(20))

Shape: (501, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,A,130.16,130.281
2024-10-25,AAPL,231.4,231.589
2024-10-25,ABBV,187.82,187.954
2024-10-25,ABNB,134.6,134.727
2024-10-25,ABT,114.24,114.33
2024-10-25,ACGL,105.28,105.396
2024-10-25,ACN,360.79,361.005
2024-10-25,ADBE,483.73,484.73
2024-10-25,ADI,230.17,230.415
2024-10-25,ADM,56.57,56.591


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,WBD,7.59,7.549
2024-10-25,WDC,69.455,69.447
2024-10-25,WEC,97.17,97.225
2024-10-25,WELL,130.19,130.241
2024-10-25,WFC,64.52,64.537
2024-10-25,WM,206.8,207.08
2024-10-25,WMB,52.51,52.497
2024-10-25,WMT,82.51,82.56
2024-10-25,WRB,58.17,58.224
2024-10-25,WST,307.92,308.129


In [28]:
# Make index column a regular column
# Rebind instead of mutating in place, and only while (Date, Ticker) are still
# index levels: re-running this cell is then a no-op rather than inserting a
# stray `index` column from the already-reset RangeIndex.
if isinstance(prediction_df.index, pd.MultiIndex):
    prediction_df = prediction_df.reset_index()

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close
0,2024-10-25,A,130.16,130.281
1,2024-10-25,AAPL,231.40,231.589
2,2024-10-25,ABBV,187.82,187.954
3,2024-10-25,ABNB,134.60,134.727
4,2024-10-25,ABT,114.24,114.330
...,...,...,...,...
496,2024-10-25,XYL,130.41,130.573
497,2024-10-25,YUM,133.04,133.129
498,2024-10-25,ZBH,102.33,102.496
499,2024-10-25,ZBRA,359.97,360.426


In [29]:
# Output archive for the predictions; the path is built from the shared project
# root so the root location is declared in exactly one place.
file_name = "lin_reg_predict.zip"
file_path = f"{relative_path_to_root}data/raw_data/{file_name}"

save_data(prediction_df, file_path)

[1m[35m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[35m║[0m[1m[95m File `lin_reg_predict.zip` already exists. Overwriting file.  [0m[1m[35m║[0m
[1m[35m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[32m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[32m║[0m[1m[92m        File saved and zipped as `lin_reg_predict.zip`         [0m[1m[32m║[0m
[1m[32m╚═══════════════════════════════════════════════════════════════╝[0m
