In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import split_dataset_by_date, clean_historical_data, check_tickers_for_missing_values
from utilities import calc_vif, calc_p_values, calc_correlation, highlight_vif, highlight_p_values, evaluate_regression_model, evaluate_cross_validation
from utilities import load_data, save_data

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Machine learning models (regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
file_name = "sp500_adj_close_raw_with_nas"
file_path = f"../../../data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `test_w_na.csv.bz2` loaded from `sp500_adj_close_raw_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


### Data Pre-Processing
___

#### Split todays data (For prediction) and historical data (For training)

In [4]:
todays_date = "2024-10-25"

historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

print("Todays Date:", todays_date)

Todays Date: 2024-10-25


#### Ensure there are missing values on Todays Data is what we are predicting...

In [5]:
todays_data.isnull().sum()

Date                    0
Ticker                  0
Adjusted Close          0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle missing values (NA's) on the historical data used to train and test...

In [6]:
historical_data = clean_historical_data(historical_data)

historical_data.isnull().sum()

Date                  0
Ticker                0
Adjusted Close        0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [7]:
tickers_no_missing_values, tickers_with_missing_values = check_tickers_for_missing_values(historical_data)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          501.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

#### Todays Data:

In [8]:
print("Today's Data Shape:", todays_data.shape)

Today's Data Shape: (501, 16)


#### Historical Data:

In [9]:
print("Historical Data Shape:", historical_data.shape)

Historical Data Shape: (1978479, 16)


### Create Multiple Versions of Dataset
___

#### Select which version of the data to work with

In [10]:
# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the input DataFrame
    df = main_data.copy().reset_index(drop=True)
    
    # Convert the `Date` column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from the `Date` column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Set the index to `Date` and `Ticker`
    df = df.set_index(["Date", "Ticker"])
    
    return df

main_data = prepare_data_v2(historical_data.copy().reset_index(drop=True))

##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [11]:
select_data = main_data.copy()

select_columns_to_drop = ["Action", "Previous Day Close", "SMA_50", "Resistance", "Upper Band", "SMA_200", "Day"]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1978479, 10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-02,A,23.256388,23.025743,-0.011976,0.015704,48.82759,23.299887,22.540236,21.392029,2008,1
2008-01-02,AAPL,5.876342,5.879055,-0.016357,0.018937,59.06735,4.939064,5.403559,4.637376,2008,1
2008-01-02,ABT,18.130209,18.019754,-0.006055,0.010484,34.677372,17.62825,18.221804,16.775562,2008,1
2008-01-02,ACGL,7.608889,7.764444,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1
2008-01-02,ACN,26.437077,25.982521,0.000833,0.024039,54.812176,27.78442,24.273775,24.765512,2008,1


### Split data features `X` and target `y`
___

In [12]:
# Split the data into features (X) and target (y)
X = data.drop(columns="Next Day Close")

y = data["Next Day Close"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1978479, 9)
Shape of y: (1978479,)


### Data Splitting
___

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1582783, 9)
Shape of X_test: (395696, 9)
Shape of y_train: (1582783,)
Shape of y_test: (395696,)


### Scaling the Data
___

#### Scale the data using `StandardScaler`

In [14]:
# Scale using StandardScaler
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1582783, 9)
X_test_scaled shape: (395696, 9)
y_train_scaled shape: (1582783, 1)
y_test_scaled shape: (395696, 1)


### Model Training
___


In [15]:
from sklearn.model_selection import GridSearchCV

# XGBoost Regressor
xgb_model = XGBRegressor(
    tree_method="hist", # Use GPU for training
    device="cuda", # Use GPU for training
    random_state=42 
)

xgb_param_grid = {
    "colsample_bytree": [0.5, 0.7], # Subsample ratio of columns when constructing each tree
    "gamma": [0, 0.1], # Minimum loss reduction required to make a further partition on a leaf node of the tree
    "learning_rate": [0.01, 0.1], # Step size shrinkage used to prevent overfitting
    "max_depth": [3, 5], # Maximum depth of the tree
    "n_estimators": [100, 200], # Number of trees
    "subsample": [0.5, 0.7], # Subsample ratio of the training instances
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_model, # XGBoost Regressor
    param_grid=xgb_param_grid, # Parameter grid
    cv=3, # Number of folds
    scoring="neg_mean_absolute_error", # Scoring metric
    # n_jobs=-1, # Use all available processors
    verbose=2 # Print detailed information
)

grid_search_xgb.fit(X_train_scaled, y_train_scaled)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   1.0s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.5; total time=   1.2s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.5; total time=   1.2s


In [16]:
import json

print(json.dumps(grid_search_xgb.best_params_, indent=4))

{
    "colsample_bytree": 0.7,
    "gamma": 0,
    "learning_rate": 0.1,
    "max_depth": 5,
    "n_estimators": 200,
    "subsample": 0.7
}


### Model Evaluation
___

In [17]:
best_xgb_model = grid_search_xgb.best_estimator_

In [20]:
# Predict the target values using the testing data
y_predict_scaled = best_xgb_model.predict(X_test_scaled)
y_train_predict_scaled = best_xgb_model.predict(X_train_scaled)

# Unscale the testing data and predictions
y_predict_unscaled = y_scaler.inverse_transform(y_predict_scaled.reshape(-1, 1))
y_test_unscaled = y_scaler.inverse_transform(y_test_scaled)

# Unscale the training data and predictions
y_train_predict_unscaled = y_scaler.inverse_transform(y_train_predict_scaled.reshape(-1, 1))
y_train_unscaled = y_scaler.inverse_transform(y_train_scaled)

In [21]:
evaluate_regression_model(
    best_xgb_model,
    "XGBoost Regressor",
    X_train_scaled,
    y_train_scaled,
    X_test_scaled,
    y_test_scaled,
    y_train_unscaled,
    y_test_unscaled,
    y_train_predict_unscaled,
    y_predict_unscaled
)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[96m              XGBoost Regressor Model Evaluation               [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[33mTraining Data Metrics          |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mMean Squared Error (Train):    |         4,161.41641         [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mRoot Mean Squared Error (Tr... |           64.50904          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mR-Squared (Train):             |           0.89611           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mAdjusted R-Squared (Train):    |           0.89611           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [22]:
cv_scores = cross_val_score(
    best_xgb_model,
    X_train_scaled,
    y_train_scaled,
    cv=5,
    scoring="r2"
)

In [23]:
evaluate_cross_validation(cv_scores, "XGBoost Regressor")

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[92m           XGBoost Regressor Cross Validation Scores           [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 1:                        |           0.85087           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 2:                        |           0.86078           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 3:                        |           0.86427           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 4:                        |           0.84942           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 5:                        |           0.85785           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [25]:
import pickle

# Save model path
model_name = "XGBRegressor.pkl"
model_path = f"../../../models/{model_name}"

# Save the model to a file
with open(model_path, "wb") as file:
    pickle.dump(best_xgb_model, file)

print(f"Model saved to {model_path}")

Model saved to ../../../models/XGBRegressor.pkl


### Predict tomorrows `Adjusted Close`
___
___
___

In [26]:
def preprocess_todays_data(df, columns_to_drop):
    # Convert 'Date' column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from 'Date' column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Drop specified columns
    df = df.drop(columns=columns_to_drop)
    
    # Set 'Date' and 'Ticker' as the index
    df = df.set_index(["Date", "Ticker"])
    
    # Drop 'Next Day Close' column
    df = df.drop(columns="Next Day Close")
    
    return df

X_to_predict = todays_data.copy()

X_to_predict = preprocess_todays_data(X_to_predict, select_columns_to_drop)

X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of todays_data_clean_scaled:", X_to_predict_scaled.shape)

Shape of todays_data_clean_scaled: (501, 9)


In [27]:
y_to_predict_scaled = best_xgb_model.predict(X_to_predict_scaled)

y_to_predict_unscaled = y_scaler.inverse_transform(y_to_predict_scaled.reshape(-1, 1))

print("Shape of y_to_predict_unscaled:", y_to_predict_unscaled.shape)

Shape of y_to_predict_unscaled: (501, 1)


In [28]:
prediction_df = todays_data.copy()

prediction_df["Next Day Close"] = y_to_predict_unscaled.round(3)

prediction_df = prediction_df[["Date", "Ticker", "Adjusted Close", "Next Day Close"]]

prediction_df = prediction_df.set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head(20))
display(prediction_df.tail(20))

Shape: (501, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,A,130.16,130.714996
2024-10-25,AAPL,231.4,234.897003
2024-10-25,ABBV,187.82,186.307999
2024-10-25,ABNB,134.6,134.709
2024-10-25,ABT,114.24,114.432999
2024-10-25,ACGL,105.28,105.343002
2024-10-25,ACN,360.79,358.028015
2024-10-25,ADBE,483.73,510.875
2024-10-25,ADI,230.17,225.906998
2024-10-25,ADM,56.57,56.719002


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Next Day Close
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-25,WBD,7.59,7.527
2024-10-25,WDC,69.455,68.667
2024-10-25,WEC,97.17,96.750999
2024-10-25,WELL,130.19,129.876999
2024-10-25,WFC,64.52,64.546997
2024-10-25,WM,206.8,208.123001
2024-10-25,WMB,52.51,53.423
2024-10-25,WMT,82.51,83.210999
2024-10-25,WRB,58.17,58.603001
2024-10-25,WST,307.92,307.705994


In [29]:
# Make index column a regular column
prediction_df.reset_index(inplace=True)

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Next Day Close
0,2024-10-25,A,130.16,130.714996
1,2024-10-25,AAPL,231.40,234.897003
2,2024-10-25,ABBV,187.82,186.307999
3,2024-10-25,ABNB,134.60,134.709000
4,2024-10-25,ABT,114.24,114.432999
...,...,...,...,...
496,2024-10-25,XYL,130.41,130.182999
497,2024-10-25,YUM,133.04,132.483994
498,2024-10-25,ZBH,102.33,102.853996
499,2024-10-25,ZBRA,359.97,356.968994


In [30]:
file_name = "XGB_predict.zip"
file_path = f"../../../data/raw_data/{file_name}"

save_data(prediction_df, file_path)

[1m[35m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[35m║[0m[1m[95m   File `XGB_predict.zip` already exists. Overwriting file.    [0m[1m[35m║[0m
[1m[35m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[32m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[32m║[0m[1m[92m          File saved and zipped as `XGB_predict.zip`           [0m[1m[32m║[0m
[1m[32m╚═══════════════════════════════════════════════════════════════╝[0m
