In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import load_data, save_data
from utilities import temporal_train_test_split
from utilities import print_title, print_label

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Model evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
def split_dataset_by_date(raw_data: pd.DataFrame, todays_date: str):
    # Filter data by today's date
    filter_data_by_date = raw_data["Date"] == todays_date
    
    # Create a new dataframe with today's data
    todays_data = raw_data[filter_data_by_date].reset_index(drop=True)
    
    # Create a new dataframe with historical data (excluding today's data)
    historical_data = raw_data[~filter_data_by_date].reset_index(drop=True)
    
    return historical_data, todays_data

In [4]:
file_name = "sp500_adj_close_raw_with_nas"
file_path = f"../../../data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `test_w_na.csv.bz2` loaded from `sp500_adj_close_raw_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Data Pre-Processing
___

#### Split todays data (For prediction) and historical data (For training)

In [5]:
todays_date = "2024-10-25"

historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

print("Todays Date:", todays_date)

Todays Date: 2024-10-25


#### Ensure there are missing values on Todays Data is what we are predicting...

In [6]:
todays_data.isnull().sum()

Date                    0
Ticker                  0
Adjusted Close          0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle missing values (NA's) on the historical data used to train and test...

In [7]:
def clean_historical_data(historical_data: pd.DataFrame) -> pd.DataFrame:
    # Remove all rows where `Adjusted Close` is 0
    historical_data = historical_data[historical_data["Adjusted Close"] != 0].reset_index(drop=True)
    
    # Remove all rows where `Volatility` is NaN
    historical_data = historical_data.dropna(subset=["Volatility"]).reset_index(drop=True)
    
    # Backfill the `RSI` column
    historical_data["RSI"] = historical_data["RSI"].bfill()
    
    # Backfill the `Action` column
    historical_data["Action"] = historical_data["Action"].bfill()
    
    return historical_data

historical_data = clean_historical_data(historical_data)

historical_data.isnull().sum()

Date                  0
Ticker                0
Adjusted Close        0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

### Exploratory Data Analysis (EDA):
___

#### Todays Data:

In [8]:
print("Today's Data Shape:", todays_data.shape)

Today's Data Shape: (501, 16)


#### Historical Data:

In [9]:
print("Historical Data Shape:", historical_data.shape)

Historical Data Shape: (1978479, 16)


### Create Multiple Versions of Dataset
___

#### Select which version of the data to work with

In [10]:
main_data = historical_data.copy().reset_index(drop=True)

# Create multiple versions of the dataset

# Data with out dates and tickers (Set as index for reference)
data_v1 = main_data.copy().set_index(["Date", "Ticker"])

# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the input DataFrame
    df = main_data.copy().reset_index(drop=True)
    
    # Convert the `Date` column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from the `Date` column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Set the index to `Date` and `Ticker`
    df = df.set_index(["Date", "Ticker"])
    
    return df

data_v2 = prepare_data_v2(main_data)

##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [11]:
select_data = data_v2.copy()

select_columns_to_drop = ["Action", "Previous Day Close", "SMA_50", "Resistance", "Upper Band", "SMA_200", "Day"]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1978479, 10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-02,A,23.256388,23.025743,-0.011976,0.015704,48.82759,23.299887,22.540236,21.392029,2008,1
2008-01-02,AAPL,5.876342,5.879055,-0.016357,0.018937,59.06735,4.939064,5.403559,4.637376,2008,1
2008-01-02,ABT,18.130209,18.019754,-0.006055,0.010484,34.677372,17.62825,18.221804,16.775562,2008,1
2008-01-02,ACGL,7.608889,7.764444,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1
2008-01-02,ACN,26.437077,25.982521,0.000833,0.024039,54.812176,27.78442,24.273775,24.765512,2008,1


### Split data features `X` and target `y`
___

In [12]:
# Split the data into features (X) and target (y)
X = data.drop(columns="Next Day Close")

y = data["Next Day Close"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1978479, 9)
Shape of y: (1978479,)


### Data Splitting
___

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1582783, 9)
Shape of X_test: (395696, 9)
Shape of y_train: (1582783,)
Shape of y_test: (395696,)


### Scaling the Data
___

#### Scale the data using `StandardScaler`

In [14]:
# Scale using StandardScaler
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1582783, 9)
X_test_scaled shape: (395696, 9)
y_train_scaled shape: (1582783, 1)
y_test_scaled shape: (395696, 1)


### Model Training
___


In [17]:
from sklearn.model_selection import GridSearchCV

ran_for = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

ran_for.fit(X_train_scaled, y_train_scaled.reshape(-1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min


building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.0min finished


### Model Evaluation
___

In [19]:
def adj_r2_score(r2, X):
    n_cols = X.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [20]:
y_predict_scaled = ran_for.predict(X_test_scaled)

y_predict_unscaled = y_scaler.inverse_transform(y_predict_scaled.reshape(-1, 1))
y_test_unscaled = y_scaler.inverse_transform(y_test_scaled.reshape(-1, 1))

mse = mean_squared_error(y_test_unscaled, y_predict_unscaled)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_unscaled, y_predict_unscaled)
adj_r2 = adj_r2_score(r2, X_test_scaled)

print_title("XGBoost Regressor", closed_corners=False)
print_label("Mean Squared Error:", mse)
print_label("Root Mean Squared Error:", rmse)
print_label("R-Squared (Training Data):", ran_for.score(X_train_scaled, y_train_scaled))
print_label("R-Squared (Testing Data):", r2)
print_label("Adjusted R-Squared:", adj_r2, closed_corners=True) 

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    1.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   22.3s finished


[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m                       XGBoost Regressor                       [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mMean Squared Error:            |           17.46137          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mRoot Mean Squared Error:       |           4.17868           [0m[1m[90m ║[0m[0m


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    4.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   40.7s finished


[1m[97m[1m[90m║ [0m[1m[97mR-Squared (Training Data):     |           0.99994           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mR-Squared (Testing Data):      |           0.99960           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mAdjusted R-Squared:            |           0.99960           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


In [27]:
cv_scores = cross_val_score(
    ran_for,
    X_train_scaled,
    y_train_scaled,
    cv=5,
    scoring="r2"
)

print_title("Cross Validation Scores", closed_corners=False)

for index, score in enumerate(cv_scores):
    print_label(f"Fold {index+1}:", score)

print_label("", "")
print_label("Mean R^2 Score:", cv_scores.mean())
print_label("Standard Deviation:", cv_scores.std(), closed_corners=True)

  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100


KeyboardInterrupt: 

### Predict tomorrows `Adjusted Close`
___

In [21]:
def preprocess_todays_data(df, columns_to_drop):
    # Convert 'Date' column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from 'Date' column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Drop specified columns
    df = df.drop(columns=columns_to_drop)
    
    # Set 'Date' and 'Ticker' as the index
    df = df.set_index(["Date", "Ticker"])
    
    # Drop 'Next Day Close' column
    df = df.drop(columns="Next Day Close")
    
    return df

X_to_predict = todays_data.copy()

X_to_predict = preprocess_todays_data(X_to_predict, select_columns_to_drop)

X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of todays_data_clean_scaled:", X_to_predict_scaled.shape)
X_to_predict_scaled[:5]

Shape of todays_data_clean_scaled: (501, 9)


array([[ 0.21029221, -0.21050541, -0.50676969, -2.04849203,  0.26539134,
         0.25211298,  0.28301857,  1.62443221,  1.02782802],
       [ 0.71656911,  0.1299215 , -0.38651463,  0.79365625,  0.70975971,
         0.74646253,  0.76314191,  1.62443221,  1.02782802],
       [ 0.49863601, -0.4592691 , -0.79119435, -0.85020721,  0.50742677,
         0.5520643 ,  0.59719398,  1.62443221,  1.02782802],
       [ 0.23249558,  0.58958269, -0.28334676,  0.37484342,  0.24850492,
         0.22527628,  0.19452787,  1.62443221,  1.02782802],
       [ 0.13068012, -0.90781959, -0.64725082,  0.08741691,  0.12313012,
         0.15204863,  0.17207307,  1.62443221,  1.02782802]])

In [22]:
y_to_predict_scaled = ran_for.predict(X_to_predict_scaled)

y_to_predict_unscaled = y_scaler.inverse_transform(y_to_predict_scaled.reshape(-1, 1))

print("Shape of y_to_predict_unscaled:", y_to_predict_unscaled.shape)
y_to_predict_unscaled[:5]

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.1s


Shape of y_to_predict_unscaled: (501, 1)


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.6s finished


array([[130.6394186 ],
       [232.1112389 ],
       [188.1922831 ],
       [133.73556546],
       [113.53746655]])

In [23]:
prediction_df = todays_data.copy()

prediction_df["Next Day Close"] = y_to_predict_unscaled.round(3)

prediction_df = prediction_df[["Date", "Ticker", "Adjusted Close", "Next Day Close"]]

prediction_df = prediction_df.set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head(20))
display(prediction_df.tail(20))

Shape: (501, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,A,130.16,130.639
2024-10-25,AAPL,231.4,232.111
2024-10-25,ABBV,187.82,188.192
2024-10-25,ABNB,134.6,133.736
2024-10-25,ABT,114.24,113.537
2024-10-25,ACGL,105.28,105.498
2024-10-25,ACN,360.79,360.321
2024-10-25,ADBE,483.73,484.574
2024-10-25,ADI,230.17,229.68
2024-10-25,ADM,56.57,56.607


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,WBD,7.59,7.697
2024-10-25,WDC,69.455,69.937
2024-10-25,WEC,97.17,97.211
2024-10-25,WELL,130.19,130.313
2024-10-25,WFC,64.52,64.927
2024-10-25,WM,206.8,207.763
2024-10-25,WMB,52.51,52.344
2024-10-25,WMT,82.51,82.573
2024-10-25,WRB,58.17,58.279
2024-10-25,WST,307.92,310.761


In [24]:
# Make index column a regular column
prediction_df.reset_index(inplace=True)

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close
0,2024-10-25,A,130.16,130.639
1,2024-10-25,AAPL,231.40,232.111
2,2024-10-25,ABBV,187.82,188.192
3,2024-10-25,ABNB,134.60,133.736
4,2024-10-25,ABT,114.24,113.537
...,...,...,...,...
496,2024-10-25,XYL,130.41,131.201
497,2024-10-25,YUM,133.04,132.853
498,2024-10-25,ZBH,102.33,101.858
499,2024-10-25,ZBRA,359.97,360.141


In [26]:
file_name = "ran_for_predict.zip"
file_path = f"../../../data/raw_data/{file_name}"

save_data(prediction_df, file_path)

[1m[32m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[32m║[0m[1m[92m        File saved and zipped as `ran_for_predict.zip`         [0m[1m[32m║[0m
[1m[32m╚═══════════════════════════════════════════════════════════════╝[0m
