## Import the required packages to extract, clean and transform the data

In [1]:
# Import in-house modules from the 'utilities' package
from utilities import print_title, print_label
from utilities import save_data, load_data
from utilities import fetch_and_download_sp500_data, sp500_data_for_today
from utilities import split_dataset_by_date, clean_historical_data
from utilities import calculate_bollinger_bands, calculate_rsi, calculate_daily_volatility
from utilities import generate_trading_signals
from utilities import generate_directions

# Import datetime module
from datetime import datetime, timedelta

# Import libraries for data analysis and visualization
import numpy as np
import pandas as pd
import joblib
import pickle

## Fetch the SP500 data using a helper functions

In [None]:
# historical_data = fetch_and_download_sp500_data(start_date="2007-01-01")
# today_data      = sp500_data_for_today()

[*********************100%***********************]  501 of 501 completed


[1m[34m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[34m║[0m[1m[94m            Download Report for S&P 500 `adj close`            [0m[1m[34m║[0m
[1m[34m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[34m║ [0m[1m[94mTotal Requested Tickers:       |             501             [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[93mTotal Downloaded Tickers:      |             501             [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[94mRequested Date Range:          |   2007-01-01 to 2024-10-30  [0m[1m[34m ║[0m[0m
[1m[97m[1m[34m║ [0m[1m[93mDownloaded Date Range:         |   2007-01-03 to 2024-10-29  [0m[1m[34m ║[0m[0m
[1m[34m║ [0m[1m[90m[3m                      S&P 500 Data Downloaded Successfully...[0m[0m[1m[34m ║[0m
[1m[34m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[34m╔═══════════════════════════════════════════════════════════════

## Combine the data into a single dataset

In [None]:
# df = pd.concat([historical_data, today_data], axis=0)

## Create DataFrame with features

In [None]:
# upper_band,        lower_band         = calculate_bollinger_bands(df)
# today_to_tomorrow, yesterday_to_today = generate_directions(df)

# featured_df = pd.DataFrame({
#   "Date":               np.repeat(df.index, len(df.columns)),
#   "Ticker":             np.tile(df.columns, len(df)),
#   "Adjusted Close":     df.values.flatten(),
#   "Today to Tomorrow":  today_to_tomorrow.values.flatten(),
#   "Yesterday to Today": yesterday_to_today.values.flatten(),
#   "Next Day Close":     df.shift(-1).values.flatten(),
#   "Previous Day Close": df.shift(1).values.flatten(),
#   "Return":             df.pct_change().values.flatten(),
#   "Volatility":         df.apply(calculate_daily_volatility).values.flatten(),
#   "RSI":                df.apply(calculate_rsi).values.flatten(),
#   "SMA_50":             df.rolling(window=50).mean().values.flatten(),
#   "SMA_100":            df.rolling(window=100).mean().values.flatten(),
#   "SMA_200":            df.rolling(window=200).mean().values.flatten(),
#   "Upper Band":         upper_band.values.flatten(),
#   "Lower Band":         lower_band.values.flatten(),
#   "Support":            df.rolling(window=50).min().values.flatten(),
#   "Resistance":         df.rolling(window=50).max().values.flatten(),
#   "Action":             df.apply(generate_trading_signals).values.flatten()
# })

# # Replace inf values with NaN using an assignment
# featured_df['Return'] = featured_df['Return'].replace([np.inf, -np.inf], np.nan)

# # Convert to datetime (Without timezone)
# featured_df['Date'] = pd.to_datetime(featured_df['Date'], utc=True).dt.date
# featured_df['Date'] = pd.to_datetime(featured_df['Date'])

# # Optimize the DataFrame by converting data types to more memory-efficient types
# featured_df[['Ticker', 'Action']] = featured_df[['Ticker', 'Action']].astype('category')
# float_cols = featured_df.select_dtypes(include=['float64']).columns
# featured_df[float_cols] = featured_df[float_cols].astype('float32')

## Filter out backfilled data

In [None]:
# # Filter rows from `2007-01-01` to `2007-12-31`
# remove_dates_filter = (featured_df['Date'] >= "2007-01-01") & (featured_df['Date'] <= "2007-12-31")

# # Remove the dates from the DataFrame
# featured_df = featured_df.loc[~remove_dates_filter].copy()

## Split the data into training and testing data using a helper function
- We split the data on the previous date because the current date may not be available.

In [None]:
# today      = datetime.now().date().strftime('%Y-%m-%d')
# yesterday  = (datetime.now().date() - timedelta(days=1)).strftime('%Y-%m-%d')
# split_date = today if featured_df["Date"].eq(today).any() else yesterday

# historical_data, current_data = split_dataset_by_date(featured_df, split_date)

## Clean the data using a helper function

In [None]:
# historical_data = clean_historical_data(historical_data)

In [30]:
from utilities import load_data, save_data

In [33]:
file_name = "sp500_updated_adj_close_with_nas"
file_path = f"data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `sp500_updated_adj_close_with_nas.csv.bz2` loaded from `sp500_updated_adj_close_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


In [35]:
start_date = "2024-10-28"
end_date = "2024-10-28"

todays_data = split_dataset_by_date(raw_data, start_date, end_date)

print("Start date:", todays_data["Date"].min())
print("End date:", todays_data["Date"].max())
print("Requested start date:", start_date)
print("Requested end date:", end_date)

Start date: 2024-10-28
End date: 2024-10-28
Requested start date: 2024-10-28
Requested end date: 2024-10-28


In [36]:
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the input DataFrame
    df = main_data.copy().reset_index(drop=True)
    
    # Convert the `Date` column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from the `Date` column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Set the index to `Date` and `Ticker`
    df = df.set_index(["Date", "Ticker"])
    
    return df

In [37]:
predict_data_raw = todays_data.copy()

predict_data_raw = prepare_data_v2(predict_data_raw)

print("Shape:", predict_data_raw.shape)
predict_data_raw

Shape: (501, 19)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-10-28,A,131.54,-1.0,1.0,131.26,130.19,0.010369,0.012224,21.147879,140.16748,136.64005,137.96117,151.51090,128.72310,130.1900,148.24400,sell,2024,10,28
2024-10-28,AAPL,233.40,1.0,1.0,233.75,231.41,0.008599,0.013773,63.837505,226.94860,222.17435,201.01146,237.86389,222.29712,216.3200,236.48000,hold,2024,10,28
2024-10-28,ABBV,189.68,-1.0,1.0,189.48,187.85,0.009742,0.008082,45.579517,192.65224,183.30026,174.40674,196.41519,185.95969,186.5400,197.77069,sell,2024,10,28
2024-10-28,ABNB,135.78,1.0,1.0,137.79,134.58,0.008917,0.014876,57.380733,125.33660,133.31970,143.59778,140.07527,125.49572,114.2800,137.19000,hold,2024,10,28
2024-10-28,ABT,114.07,-1.0,-1.0,113.44,114.22,-0.001313,0.010344,49.922375,114.22140,109.51645,109.52122,119.48556,111.08734,110.2504,119.39000,short,2024,10,28
2024-10-28,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,XYL,130.79,-1.0,1.0,129.92,130.42,0.002837,0.009297,39.808743,133.47752,134.33392,130.94939,138.42334,129.49466,126.7100,137.53000,sell,2024,10,28
2024-10-28,YUM,134.86,-1.0,1.0,134.09,133.04,0.013680,0.006889,50.717210,134.57391,133.39450,134.04712,138.69244,130.99457,129.7100,139.92000,sell,2024,10,28
2024-10-28,ZBH,103.60,1.0,1.0,103.88,102.35,0.012213,0.010964,56.833466,107.23039,107.88450,115.28450,107.08596,101.51304,101.7700,115.91237,hold,2024,10,28
2024-10-28,ZBRA,363.58,1.0,1.0,384.50,360.09,0.009692,0.009976,43.572613,356.28200,337.45250,311.58470,380.04346,359.12952,320.7700,377.68000,hold,2024,10,28


## Prepare the prediction DataFrame
- Update the Date column to a datetime format
- Add Year, Month, Day, columns
- Set the index by the Date and Ticker columns

## Drop low relevance columns determined by VIF and P-value analysis

In [54]:
select_target_to_drop   = ["Today to Tomorrow"]
select_features_to_drop = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]
log_select_to_drop      = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200",
                           "Next Day Close", "Adjusted Close", "Support", "Lower Band", "SMA_100"]


prediction_df = predict_data_raw.copy()
predict_data_clean     = prediction_df.drop(columns=[*select_features_to_drop, *select_target_to_drop])
log_predict_data_clean = prediction_df.drop(columns=[*log_select_to_drop, *select_target_to_drop])

In [55]:
predict_data_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Yesterday to Today,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-10-28,A,131.54,1.0,0.010369,0.012224,21.147879,136.64005,128.72310,130.1900,2024,10,28
2024-10-28,AAPL,233.40,1.0,0.008599,0.013773,63.837505,222.17435,222.29712,216.3200,2024,10,28
2024-10-28,ABBV,189.68,1.0,0.009742,0.008082,45.579517,183.30026,185.95969,186.5400,2024,10,28
2024-10-28,ABNB,135.78,1.0,0.008917,0.014876,57.380733,133.31970,125.49572,114.2800,2024,10,28
2024-10-28,ABT,114.07,-1.0,-0.001313,0.010344,49.922375,109.51645,111.08734,110.2504,2024,10,28
2024-10-28,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,XYL,130.79,1.0,0.002837,0.009297,39.808743,134.33392,129.49466,126.7100,2024,10,28
2024-10-28,YUM,134.86,1.0,0.013680,0.006889,50.717210,133.39450,130.99457,129.7100,2024,10,28
2024-10-28,ZBH,103.60,1.0,0.012213,0.010964,56.833466,107.88450,101.51304,101.7700,2024,10,28
2024-10-28,ZBRA,363.58,1.0,0.009692,0.009976,43.572613,337.45250,359.12952,320.7700,2024,10,28


In [51]:
def clean_historical_data(historical_data: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the historical data by removing rows with specific conditions and backfilling missing values.

    Parameters:
    historical_data (pd.DataFrame): The historical data to be cleaned.

    Returns:
    pd.DataFrame: The cleaned historical data.
    """
    # Remove all rows where `Adjusted Close` is 0
    historical_data = historical_data[historical_data["Adjusted Close"] != 0].reset_index(drop=True)
    
    # Remove all rows where `Volatility` is NaN
    historical_data = historical_data.dropna(subset=["Volatility"]).reset_index(drop=True)
    
    # Backfill the `RSI` column
    historical_data["RSI"] = historical_data["RSI"].bfill()
    
    # Backfill the `Action` column
    historical_data["Action"] = historical_data["Action"].bfill()
    
    return historical_data

In [57]:
log_predict_data_clean.isnull().sum()

Yesterday to Today    0
Return                1
Volatility            1
RSI                   1
Year                  0
Month                 0
Day                   0
dtype: int64

In [58]:
log_predict_data_clean["Return"].bfill(inplace=True)
log_predict_data_clean["Volatility"].bfill(inplace=True)
log_predict_data_clean["RSI"].bfill(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  log_predict_data_clean["Return"].bfill(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  log_predict_data_clean["Volatility"].bfill(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [59]:
log_predict_data_clean.isnull().sum()

Yesterday to Today    0
Return                0
Volatility            0
RSI                   0
Year                  0
Month                 0
Day                   0
dtype: int64

## Import pre-trained scaling models and apply them to the data

In [60]:
X_scaler     = joblib.load("models/christian's_models/X_scaler.pkl")
X_scaler_log = joblib.load("models/christian's_models/X_scaler_log.pkl")
X_scaled     = X_scaler.transform(predict_data_clean)
X_scaled_log = X_scaler_log.transform(log_predict_data_clean)

## Import pre-trained prediction models

In [61]:
xbg_model           = pickle.load(open("models/christian's_models/clf_XGB_v2.pkl", "rb"))
random_forest_model = pickle.load(open("models/christian's_models/random_forest_classifier_v2.pkl", "rb"))
logistic_model      = pickle.load(open("models/christian's_models/logistic_regression_v1.pkl", "rb"))

## Predict the data using the pre-trained models

In [62]:
y_XGB = xbg_model.predict(X_scaled)
y_RFC = random_forest_model.predict(X_scaled)
y_LR  = logistic_model.predict(X_scaled_log)

# Change values where `0` is to `-1`
y_XGB[y_XGB == 0] = -1
y_RFC[y_RFC == 0] = -1
y_LR[y_LR == 0] = -1

y_XGB[:5], y_RFC[:5], y_LR[:5]

(array([1, 1, 1, 1, 1]), array([1, 1, 1, 1, 1]), array([1., 1., 1., 1., 1.]))

## Set up the output DataFrame

In [63]:
todays_data_predict = current_data[["Date", "Ticker", "Adjusted Close", "Today to Tomorrow"]].copy()

# Change to integer
todays_data_predict["XGB Today to Tomorrow"] = y_XGB
todays_data_predict["RanFC Today to Tomorrow"] = y_RFC
todays_data_predict["Log_R Today to Tomorrow"] = y_LR.astype(int)

# Print the unique values of the predictions
print(todays_data_predict["XGB Today to Tomorrow"].unique())
print(todays_data_predict["RanFC Today to Tomorrow"].unique())
print(todays_data_predict["Log_R Today to Tomorrow"].unique())

todays_data_predict.head(50)

[ 1 -1]
[ 1 -1]
[ 1 -1]


Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,XGB Today to Tomorrow,RanFC Today to Tomorrow,Log_R Today to Tomorrow
0,2024-10-30,A,131.520004,,1,1,1
1,2024-10-30,AAPL,230.139999,,1,1,1
2,2024-10-30,ABBV,201.520004,,1,1,1
3,2024-10-30,ABNB,136.5,,1,1,1
4,2024-10-30,ABT,114.489998,,1,1,1
5,2024-10-30,ACGL,105.144997,,1,1,1
6,2024-10-30,ACN,346.600006,,1,1,1
7,2024-10-30,ADBE,486.899994,,1,1,1
8,2024-10-30,ADI,230.130005,,1,1,1
9,2024-10-30,ADM,55.57,,1,1,1
