## Import the required packages to extract, clean and transform the data

In [1]:
# Import in-house modules from the 'utilities' package
from utilities import print_title, print_label
from utilities import save_data, load_data
from utilities import fetch_and_download_sp500_data, sp500_data_for_today
from utilities import split_dataset_by_date, clean_historical_data
from utilities import calculate_bollinger_bands, calculate_rsi, calculate_daily_volatility
from utilities import generate_trading_signals
from utilities import generate_directions

# Import datetime module
from datetime import datetime, timedelta

# Import libraries for data analysis and visualization
import numpy as np
import pandas as pd
import joblib
import pickle

## Fetch the SP500 data using a helper functions

In [2]:
historical_data = fetch_and_download_sp500_data(start_date="2007-01-01")
today_data      = sp500_data_for_today()

[*********************100%***********************]  501 of 501 completed


[1m[34m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[34m║[0m[1m[94m            Download Report for S&P 500 `adj close`            [0m[1m[34m║[0m
[1m[34m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[34m║ [0m[1m[94mTotal Requested Tickers:       |             501             [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[93mTotal Downloaded Tickers:      |             501             [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[94mRequested Date Range:          |   2007-01-01 to 2024-10-30  [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[93mDownloaded Date Range:         |   2007-01-03 to 2024-10-29  [0m[1m[34m ║[0m[0m
[1m[34m║ [0m[1m[90m[3m                      S&P 500 Data Downloaded Successfully...[0m[0m[1m[34m ║[0m
[1m[34m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[34m╔═══════════════════════════════════════════════════════════════

## Combine the data into a single dataset

In [3]:
df = pd.concat([historical_data, today_data], axis=0)

## Create DataFrame with features

In [4]:
upper_band,        lower_band         = calculate_bollinger_bands(df)
today_to_tomorrow, yesterday_to_today = generate_directions(df)

featured_df = pd.DataFrame({
  "Date":               np.repeat(df.index, len(df.columns)),
  "Ticker":             np.tile(df.columns, len(df)),
  "Adjusted Close":     df.values.flatten(),
  "Today to Tomorrow":  today_to_tomorrow.values.flatten(),
  "Yesterday to Today": yesterday_to_today.values.flatten(),
  "Next Day Close":     df.shift(-1).values.flatten(),
  "Previous Day Close": df.shift(1).values.flatten(),
  "Return":             df.pct_change().values.flatten(),
  "Volatility":         df.apply(calculate_daily_volatility).values.flatten(),
  "RSI":                df.apply(calculate_rsi).values.flatten(),
  "SMA_50":             df.rolling(window=50).mean().values.flatten(),
  "SMA_100":            df.rolling(window=100).mean().values.flatten(),
  "SMA_200":            df.rolling(window=200).mean().values.flatten(),
  "Upper Band":         upper_band.values.flatten(),
  "Lower Band":         lower_band.values.flatten(),
  "Support":            df.rolling(window=50).min().values.flatten(),
  "Resistance":         df.rolling(window=50).max().values.flatten(),
  "Action":             df.apply(generate_trading_signals).values.flatten()
})

# Replace inf values with NaN using an assignment
featured_df['Return'] = featured_df['Return'].replace([np.inf, -np.inf], np.nan)

# Convert to datetime (Without timezone)
featured_df['Date'] = pd.to_datetime(featured_df['Date'], utc=True).dt.date
featured_df['Date'] = pd.to_datetime(featured_df['Date'])

# Optimize the DataFrame by converting data types to more memory-efficient types
featured_df[['Ticker', 'Action']] = featured_df[['Ticker', 'Action']].astype('category')
float_cols = featured_df.select_dtypes(include=['float64']).columns
featured_df[float_cols] = featured_df[float_cols].astype('float32')

## Filter out backfilled data

In [5]:
# Filter rows from `2007-01-01` to `2007-12-31`
remove_dates_filter = (featured_df['Date'] >= "2007-01-01") & (featured_df['Date'] <= "2007-12-31")

# Remove the dates from the DataFrame
featured_df = featured_df.loc[~remove_dates_filter].copy()

## Split the data into training and testing data using a helper function
- We split the data on the previous date because the current date may not be available.

In [6]:
def split_dataset_by_date(raw_data: pd.DataFrame, todays_date: str) -> tuple:
    """
    Split the dataset into historical data and today's data based on the given date.

    Parameters:
    raw_data (pd.DataFrame): The raw data containing a 'Date' column.
    todays_date (str): The date to filter today's data.

    Returns:
    tuple: A tuple containing the historical data and today's data.
    """
    start_date = "2024-10-25"
    end_date = "2024-10-25"

    filter_data_by_date = (raw_data["Date"] >= start_date) & (raw_data["Date"] <= end_date)
    
    return raw_data[~filter_data_by_date], raw_data[filter_data_by_date]

In [7]:
today      = datetime.now().date().strftime('%Y-%m-%d')
yesterday  = (datetime.now().date() - timedelta(days=1)).strftime('%Y-%m-%d')
split_date = today if featured_df["Date"].eq(today).any() else yesterday

historical_data, current_data = split_dataset_by_date(featured_df, split_date)

## Clean the data using a helper function

In [8]:
historical_data = clean_historical_data(historical_data)

## Prepare the prediction DataFrame
- Update the Date column to a datetime format
- Add Year, Month, Day, columns
- Set the index by the Date and Ticker columns

In [9]:
prediction_df = current_data.reset_index(drop=True)
prediction_df["Date"]  = pd.to_datetime(prediction_df["Date"])
prediction_df["Year"]  = prediction_df["Date"].dt.year
prediction_df["Month"] = prediction_df["Date"].dt.month
prediction_df["Day"]   = prediction_df["Date"].dt.day
prediction_df.set_index(["Date", "Ticker"], inplace=True)

## Drop low relevance columns determined by VIF and P-value analysis

In [10]:
select_target_to_drop   = ["Today to Tomorrow"]
select_features_to_drop = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]
log_select_to_drop      = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200",
                           "Next Day Close", "Adjusted Close", "Support", "Lower Band", "SMA_100"]

predict_data_clean     = prediction_df.drop(columns=[*select_features_to_drop, *select_target_to_drop])
log_predict_data_clean = prediction_df.drop(columns=[*log_select_to_drop, *select_target_to_drop])

## Import pre-trained scaling models and apply them to the data

In [11]:
X_scaler     = joblib.load("models/christian's_models/X_scaler.pkl")
X_scaler_log = joblib.load("models/christian's_models/X_scaler_log.pkl")
X_scaled     = X_scaler.transform(predict_data_clean)
X_scaled_log = X_scaler_log.transform(log_predict_data_clean)

## Import pre-trained prediction models

In [12]:
xbg_model           = pickle.load(open("models/christian's_models/clf_XGB_v2.pkl", "rb"))
random_forest_model = pickle.load(open("models/christian's_models/random_forest_classifier_v2.pkl", "rb"))
logistic_model      = pickle.load(open("models/christian's_models/logistic_regression_v1.pkl", "rb"))

## Predict the data using the pre-trained models

In [13]:
y_XGB = xbg_model.predict(X_scaled)
y_RFC = random_forest_model.predict(X_scaled)
y_LR  = logistic_model.predict(X_scaled_log)

# Change values where `0` is to `-1`
y_XGB[y_XGB == 0] = -1
y_RFC[y_RFC == 0] = -1
y_LR[y_LR == 0] = -1

y_XGB[:5], y_RFC[:5], y_LR[:5]

(array([1, 1, 1, 1, 1]), array([1, 1, 1, 1, 1]), array([1., 1., 1., 1., 1.]))

## Set up the output DataFrame

In [14]:
todays_data_predict = current_data[["Date", "Ticker", "Adjusted Close", "Today to Tomorrow"]].copy()

# Change to integer
todays_data_predict["XGB Today to Tomorrow"] = y_XGB
todays_data_predict["RanFC Today to Tomorrow"] = y_RFC
todays_data_predict["Log_R Today to Tomorrow"] = y_LR.astype(int)

# Print the unique values of the predictions
print(todays_data_predict["XGB Today to Tomorrow"].unique())
print(todays_data_predict["RanFC Today to Tomorrow"].unique())
print(todays_data_predict["Log_R Today to Tomorrow"].unique())

todays_data_predict.head(50)

[ 1 -1]
[ 1 -1]
[ 1 -1]


Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,XGB Today to Tomorrow,RanFC Today to Tomorrow,Log_R Today to Tomorrow
2246484,2024-10-25,A,130.190002,1.0,1,1,1
2246485,2024-10-25,AAPL,231.410004,1.0,1,1,1
2246486,2024-10-25,ABBV,187.850006,1.0,1,1,1
2246487,2024-10-25,ABNB,134.580002,1.0,1,1,1
2246488,2024-10-25,ABT,114.220001,-1.0,1,1,1
2246489,2024-10-25,ACGL,105.300003,1.0,1,1,1
2246490,2024-10-25,ACN,360.799988,1.0,1,1,1
2246491,2024-10-25,ADBE,483.720001,-1.0,1,1,1
2246492,2024-10-25,ADI,230.169998,-1.0,1,1,1
2246493,2024-10-25,ADM,56.560001,1.0,1,1,1


In [22]:
from sklearn.metrics import confusion_matrix

XGB_confusion = confusion_matrix(
    todays_data_predict["Today to Tomorrow"], todays_data_predict["XGB Today to Tomorrow"])

RFC_confusion = confusion_matrix(
    todays_data_predict["Today to Tomorrow"], todays_data_predict["RanFC Today to Tomorrow"])

LR_confusion = confusion_matrix(
    todays_data_predict["Today to Tomorrow"], todays_data_predict["Log_R Today to Tomorrow"])

In [23]:
df_confusion_XGB = pd.DataFrame(
    XGB_confusion,
    index=["Actual -1", "Actual 1"],
    columns=["Predicted -1", "Predicted 1"]
)

df_confusion_RFC = pd.DataFrame(
    RFC_confusion,
    index=["Actual -1", "Actual 1"],
    columns=["Predicted -1", "Predicted 1"]
)

df_confusion_LR = pd.DataFrame(
    LR_confusion,
    index=["Actual -1", "Actual 1"],
    columns=["Predicted -1", "Predicted 1"]
)

print_title("Confusion Matrix for XGB Model")
display(df_confusion_XGB)

print_title("Confusion Matrix for Random Forest Model")
display(df_confusion_RFC)

print_title("Confusion Matrix for Logistic Regression Model")
display(df_confusion_LR)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m                Confusion Matrix for XGB Model                 [0m[1m[90m║[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


Unnamed: 0,Predicted -1,Predicted 1
Actual -1,1,146
Actual 1,0,354


[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m           Confusion Matrix for Random Forest Model            [0m[1m[90m║[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


Unnamed: 0,Predicted -1,Predicted 1
Actual -1,10,137
Actual 1,11,343


[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m        Confusion Matrix for Logistic Regression Model         [0m[1m[90m║[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


Unnamed: 0,Predicted -1,Predicted 1
Actual -1,3,144
Actual 1,2,352
