In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import clean_historical_data, check_tickers_for_missing_values
from utilities import calc_vif, calc_p_values, calc_correlation, highlight_vif, highlight_p_values, evaluate_regression_model, evaluate_cross_validation, evaluate_classifier_model
from utilities import load_data, save_data

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Base name of the raw dataset; the path is handed to the project helper without
# a file extension (presumably load_data resolves the archive itself — confirm in utilities).
file_name = "sp500_updated_adj_close_with_nas"
file_path = "../../../data/raw_data/" + file_name

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96mFile `sp500_updated_adj_close_with_nas.csv.bz2` loaded from `sp500_updated_adj_close_with_nas.zip`[0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


#### Inspect Tickers Individually

In [4]:
# Look at the latest rows of a single ticker in the raw data (cleaning happens in later cells)
select_df = raw_data.copy()

select_ticker = select_df["Ticker"] == "AAPL"

select_df.loc[select_ticker].tail(10)

Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
2117227,2024-10-16,AAPL,231.78,1.0,-1.0,232.15,233.85,-0.008852,0.015279,55.54687,224.63441,218.88077,199.03271,233.71204,222.44496,209.57742,233.85,buy
2117728,2024-10-17,AAPL,232.15,1.0,1.0,235.0,231.78,0.001596,0.014905,55.662342,225.08586,219.30466,199.26877,234.15703,222.32797,213.06339,233.85,hold
2118229,2024-10-18,AAPL,235.0,1.0,1.0,236.48,232.15,0.012277,0.012875,52.767017,225.5246,219.75696,199.526,235.22388,221.94112,215.99,235.0,hold
2118730,2024-10-21,AAPL,236.48,-1.0,1.0,235.86,235.0,0.006298,0.01288,66.655846,225.9344,220.22105,199.80229,236.51566,221.65034,216.32,236.48,sell
2119231,2024-10-22,AAPL,235.86,-1.0,-1.0,230.76,236.48,-0.002622,0.012744,64.70208,226.301,220.66896,200.07912,237.4785,221.5365,216.32,236.48,short
2119732,2024-10-23,AAPL,230.76,-1.0,-1.0,230.57,235.86,-0.021623,0.013733,57.298534,226.4908,221.05629,200.30862,237.57504,221.87895,216.32,236.48,short
2120233,2024-10-24,AAPL,230.57,1.0,-1.0,231.41,230.76,-0.000823,0.013688,55.555565,226.6678,221.42393,200.53928,237.66525,222.09375,216.32,236.48,buy
2120734,2024-10-25,AAPL,231.41,1.0,1.0,233.4,230.57,0.003643,0.01367,66.38572,226.8016,221.79678,200.76889,237.80995,222.31105,216.32,236.48,hold
2121235,2024-10-28,AAPL,233.4,1.0,1.0,233.75,231.41,0.008599,0.013773,63.837505,226.9486,222.17435,201.01146,237.86389,222.29712,216.32,236.48,hold
2121736,2024-10-29,AAPL,233.75,,1.0,,233.4,0.0015,0.012854,58.716377,227.1058,222.56929,201.25412,238.18172,222.73328,216.32,236.48,


### Data Pre-Processing
___

In [5]:
# Drop every row whose "Today to Tomorrow" value is exactly 0 (the number zero);
# rows where the value is missing are kept.
value_filter = raw_data["Today to Tomorrow"] == 0
keep_rows = ~value_filter

raw_data = raw_data.loc[keep_rows]

# Confirm which distinct values remain in the column
raw_data["Today to Tomorrow"].unique()

array([-1.,  1., nan])

In [6]:
def split_dataset_by_date(raw_data: pd.DataFrame, split_date: str) -> tuple:
    """Partition rows into those dated before ``split_date`` and all the others.

    Args:
        raw_data: Frame with a ``Date`` column that can be compared to ``split_date``.
        split_date: Cut-off value; rows with ``Date < split_date`` are historical.

    Returns:
        ``(historical_data, predict_data)``. ``historical_data`` holds the rows
        strictly before the cut-off, ``predict_data`` holds every remaining row
        (the cut-off date itself included). Both get a fresh 0..n-1 index.
    """
    is_before_split = raw_data["Date"] < split_date

    # Rows strictly before the cut-off: used for training
    historical_data = raw_data.loc[is_before_split].reset_index(drop=True)

    # Everything else (cut-off date onwards): held back for prediction
    predict_data = raw_data.loc[~is_before_split].reset_index(drop=True)

    return historical_data, predict_data

#### Split today's data (for prediction) and historical data (for training)

In [7]:
# Rows dated before this cut-off become the training history; the rest are held back for prediction
split_date = "2024-09-30"

historical_data, predict_data = split_dataset_by_date(raw_data, split_date)

print("Split Date:", split_date)
for label, frame in (("Historical", historical_data), ("Predict", predict_data)):
    dates = frame["Date"]
    print(f"{label} Data Start and End Dates:", dates.min(), dates.max())

Split Date: 2024-09-30
Historical Data Start and End Dates: 2008-01-02 2024-09-27
Predict Data Start and End Dates: 2024-09-30 2024-10-29


#### Ensure the Only Missing Values in Today's Data Are the Ones We Are Predicting

In [8]:
# Run the project cleaning helper on the prediction window
predict_data = clean_historical_data(predict_data)

predict_dates = predict_data["Date"]
print("Start Date:", predict_dates.min())
print("End Date:", predict_dates.max())

# Null count per column
predict_data.isnull().sum()

Start Date: 2024-09-30
End Date: 2024-10-29


Date                    0
Ticker                  0
Adjusted Close          0
Today to Tomorrow     500
Yesterday to Today      0
Next Day Close        500
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                500
dtype: int64

#### Handle Missing Values (NA's)


In [9]:
# Run the project cleaning helper on the training history
historical_data = clean_historical_data(historical_data)

historical_dates = historical_data["Date"]
print("Start Date:", historical_dates.min())
print("End Date:", historical_dates.max())

# Null count per column
historical_data.isnull().sum()

Start Date: 2008-01-02
End Date: 2024-09-27


Date                  0
Ticker                0
Adjusted Close        0
Today to Tomorrow     0
Yesterday to Today    0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [10]:
# Per-ticker missing-value report from the project helper. Its two return values
# are kept under the names below (presumably the tickers without and with gaps —
# confirm against utilities.check_tickers_for_missing_values).
tickers_no_missing_values, tickers_with_missing_values = check_tickers_for_missing_values(historical_data)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          499.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          499.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

#### Predict Data:

In [11]:
# (rows, columns) of the cleaned prediction window
print("Shape:", predict_data.shape)

Shape: (10941, 18)


#### Historical Data:

In [12]:
# (rows, columns) of the cleaned training history
print("Shape:", historical_data.shape)

Shape: (1950560, 18)


### Create Multiple Versions of Dataset
___

In [13]:
# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``main_data`` with calendar features, indexed by date and ticker.

    Args:
        main_data: Frame containing at least ``Date`` and ``Ticker`` columns.
            It is not modified.

    Returns:
        A new frame in which ``Date`` is parsed to datetime, integer ``Year``,
        ``Month`` and ``Day`` columns are appended, and ``(Date, Ticker)`` form
        the row index.
    """
    # Work on a re-indexed copy so the caller's frame is left untouched
    frame = main_data.copy().reset_index(drop=True)
    dates = pd.to_datetime(frame["Date"])

    return (
        frame
        .assign(
            Date=dates,
            Year=dates.dt.year,
            Month=dates.dt.month,
            Day=dates.dt.day,
        )
        .set_index(["Date", "Ticker"])
    )

# prepare_data_v2 already copies and re-indexes its input, so historical_data is
# passed directly instead of making a second full copy of the frame first.
main_data = prepare_data_v2(historical_data)

#### Select which version of the data to work with

In [14]:
# Display the (Date, Ticker)-indexed frame; the rendered output is truncated to the first and last rows
main_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2008-01-02,A,23.256384,-1.0,-1.0,23.025742,23.538282,-0.011976,0.015704,48.827510,23.314175,23.299887,23.564934,24.727250,22.540232,21.392033,24.351938,short,2008,1,2
2008-01-02,AAPL,5.876340,1.0,-1.0,5.879058,5.974059,-0.016357,0.018937,59.067307,5.518483,4.939064,4.197630,6.135835,5.403558,4.637376,6.026840,buy,2008,1,2
2008-01-02,ABT,18.130210,-1.0,-1.0,18.019747,18.240658,-0.006055,0.010484,34.677720,18.138460,17.628250,17.709028,19.233109,18.221806,16.775566,19.134014,short,2008,1,2
2008-01-02,ACGL,7.608889,1.0,-1.0,7.764444,7.816667,-0.026581,0.016022,45.154190,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy,2008,1,2
2008-01-02,ACN,26.437069,-1.0,1.0,25.982530,26.415083,0.000832,0.024039,54.812275,26.577984,27.784420,28.471031,28.227203,24.273777,24.765514,29.215666,sell,2008,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-27,XYL,134.510000,1.0,1.0,135.030000,134.130000,0.002833,0.015274,70.569610,133.008790,135.843140,128.534210,137.803190,126.457810,125.805660,141.154860,hold,2024,9,27
2024-09-27,YUM,139.920000,-1.0,1.0,139.710000,138.070000,0.013399,0.012814,66.764175,133.453030,133.806440,133.268300,138.392100,128.823900,125.324680,139.920000,sell,2024,9,27
2024-09-27,ZBH,107.980000,-1.0,1.0,107.950000,107.471130,0.004735,0.020791,70.106766,109.309240,110.604160,116.869156,114.414790,100.352870,104.238320,115.912370,sell,2024,9,27
2024-09-27,ZBRA,368.600000,1.0,-1.0,370.320000,371.240000,-0.007111,0.018034,83.407295,339.336200,326.619300,300.337160,379.808500,314.348500,314.650000,371.240000,buy,2024,9,27


In [15]:
select_data = main_data.copy()

# Columns excluded from the modelling frame. Each label appears exactly once
# (the original list named "Next Day Close" twice). This list is reused later
# when the prediction data is preprocessed, so both frames drop the same columns.
select_columns_to_drop = [
    "Action",
    "Next Day Close",
    "Previous Day Close",
    "SMA_50",
    "Resistance",
    "Upper Band",
    "SMA_200",
    "Adjusted Close",
    "Support",
    "Lower Band",
    "SMA_100",
]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1950560, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,Today to Tomorrow,Yesterday to Today,Return,Volatility,RSI,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2008-01-02,A,-1.0,-1.0,-0.011976,0.015704,48.82751,2008,1,2
2008-01-02,AAPL,1.0,-1.0,-0.016357,0.018937,59.067307,2008,1,2
2008-01-02,ABT,-1.0,-1.0,-0.006055,0.010484,34.67772,2008,1,2
2008-01-02,ACGL,1.0,-1.0,-0.026581,0.016022,45.15419,2008,1,2
2008-01-02,ACN,-1.0,1.0,0.000832,0.024039,54.812275,2008,1,2


### Split data features `X` and target `y`
___

In [16]:
target = "Today to Tomorrow"

# The label is the target column; the features are every other column of `data`
y = data[target]
X = data.drop(columns=[target])

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1950560, 7)
Shape of y: (1950560,)


### Feature Engineering
___

#### Inspect Multicollinearity using VIF

In [17]:
# Pairwise correlation between the feature columns of X (computed by the project helper calc_correlation)
calc_correlation(X)

Unnamed: 0,Yesterday to Today,Return,Volatility,RSI,Year,Month,Day
Yesterday to Today,1.0,0.633165,-0.013552,0.202797,0.011005,0.000123,-0.00857
Return,0.633165,1.0,0.027243,0.215085,0.001794,0.00293,-0.000102
Volatility,-0.013552,0.027243,1.0,-0.096164,-0.137099,0.010338,-0.003634
RSI,0.202797,0.215085,-0.096164,1.0,0.011103,-0.003006,0.003873
Year,0.011005,0.001794,-0.137099,0.011103,1.0,-0.036902,0.000166
Month,0.000123,0.00293,0.010338,-0.003006,-0.036902,1.0,2.7e-05
Day,-0.00857,-0.000102,-0.003634,0.003873,0.000166,2.7e-05,1.0


##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

In [18]:
# Variance Inflation Factor per feature, computed by the project helper
vif = calc_vif(X)

# Styler.apply forwards extra keyword arguments to the styling function.
# NOTE(review): threshold=2000 is far above every VIF in the rendered table — confirm it is intended.
vif.style.apply(highlight_vif, threshold=2000)

Unnamed: 0,VIF
Yesterday to Today,1.687847
Return,1.698659
Volatility,3.159457
Day,4.229707
Month,4.622204
RSI,11.871121
Year,21.632499


### Data Splitting
___

In [19]:
# Hold out 20% of the rows for testing; random_state pins the shuffle for reproducibility.
# NOTE(review): train_test_split shuffles by default, so rows from the same dates end up
# in both splits; for date-ordered data a chronological split may be more appropriate — confirm intent.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for part_name, part in zip(("X_train", "X_test", "y_train", "y_test"), split_parts):
    print(f"Shape of {part_name}:", part.shape)

Shape of X_train: (1560448, 7)
Shape of X_test: (390112, 7)
Shape of y_train: (1560448,)
Shape of y_test: (390112,)


#### Inspect Probability Values `(p-values)`

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [20]:
# Per-feature p-values on the training split. calc_p_values also returns a model
# object, kept as ols_model (presumably the fitted OLS regression — confirm in utilities).
p_values, ols_model = calc_p_values(X_train, y_train)

# Render the table with the project's p-value highlighting
p_values.style.apply(highlight_p_values)

Unnamed: 0,p_value
Year,0.0
Volatility,0.0
RSI,0.0
Return,0.0
Yesterday to Today,0.0
Day,0.349688
Month,0.606815


### Model Training
___


#### Scale the data using `StandardScaler`

In [21]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split. The target is a class label and is left unscaled.
X_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

for split_name, scaled in (("X_train_scaled", X_train_scaled), ("X_test_scaled", X_test_scaled)):
    print(f"{split_name} shape:", scaled.shape)

X_train_scaled shape: (1560448, 7)
X_test_scaled shape: (390112, 7)


In [22]:
# Class balance of the target over the full historical set (before the train/test split)
y.value_counts()

Today to Tomorrow
 1.0    1021992
-1.0     928568
Name: count, dtype: int64

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Template estimator; GridSearchCV fits clones of it, one per candidate and fold
log_reg_model = LogisticRegression()

# Search space: regularisation strength and penalty type, both with the liblinear solver
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    penalty=["l1", "l2"],
    solver=["liblinear"],
)

# 5-fold cross-validated exhaustive search; verbose=3 prints one line per fit
grid_search = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

# Run the search on the scaled training features
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.526 total time=   1.0s
[CV 2/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.526 total time=   1.1s
[CV 3/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.526 total time=   1.1s
[CV 4/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.526 total time=   0.9s
[CV 5/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.526 total time=   0.8s
[CV 1/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.527 total time=   1.1s
[CV 2/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.526 total time=   1.1s
[CV 3/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.527 total time=   1.1s
[CV 4/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.526 total time=   1.1s
[CV 5/5] END C=0.001, penalty=l2, solver=liblinear;, score=0.526 total time=   1.1s
[CV 1/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.527 total time=   1.0s
[CV 2/5] END C=0

In [25]:
import json

# Pretty-print the hyperparameter combination selected by the grid search
print(json.dumps(grid_search.best_params_, indent=4))

{
    "C": 0.01,
    "penalty": "l1",
    "solver": "liblinear"
}


In [26]:
# Estimator refitted by the grid search with the best parameters.
# NOTE(review): despite the "xgb" in the name, grid_search wraps a LogisticRegression.
best_xgb_model = grid_search.best_estimator_

In [27]:
# Class predictions from the tuned estimator on the scaled training and test features
y_train_predict = best_xgb_model.predict(X_train_scaled)
y_test_predict = best_xgb_model.predict(X_test_scaled)

In [28]:
# Print the train/test metrics via the project helper and keep the two confusion matrices it returns
confusion_train, confusion_test = evaluate_classifier_model(
    "Logistic Regression", y_train, y_test, y_train_predict, y_test_predict
)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[96m             Logistic Regression Model Evaluation              [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[33mTraining Data Metrics          |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mAccuracy (Train):              |           0.52640           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mPrecision (Train):             |           0.52509           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mRecall (Train):                |           0.52640           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mF1 Score (Train):              |           0.40131           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [29]:
import numpy as np

# np.unique returns the distinct labels in sorted order; this is printed to
# confirm the row/column order assumed when labelling the confusion matrices.
classes = np.unique(y_train)
print("Class order:", classes)

Class order: [-1.  1.]


In [30]:
# Label the training confusion matrix: rows are actual classes, columns are predicted classes
actual_labels = ["Actual -1", "Actual 1"]
predicted_labels = ["Predicted -1", "Predicted 1"]

df_confusion_train = pd.DataFrame(confusion_train, index=actual_labels, columns=predicted_labels)

df_confusion_train

Unnamed: 0,Predicted -1,Predicted 1
Actual -1,38847,703670
Actual 1,35361,782570


In [31]:
# Label the test confusion matrix: rows are actual classes, columns are predicted classes
actual_labels = ["Actual -1", "Actual 1"]
predicted_labels = ["Predicted -1", "Predicted 1"]

df_confusion_test = pd.DataFrame(confusion_test, index=actual_labels, columns=predicted_labels)

df_confusion_test

Unnamed: 0,Predicted -1,Predicted 1
Actual -1,9750,176301
Actual 1,8797,195264


In [32]:
# 5-fold cross-validated accuracy on the scaled training split.
# y_train holds the two class labels -1 and 1 (rows with 0 were filtered out earlier).
# NOTE(review): log_reg_model is the default-parameter estimator, not the tuned
# grid_search.best_estimator_ — confirm which one is meant to be scored here.
cv_scores = cross_val_score(
    estimator=log_reg_model,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    scoring="accuracy",
)

In [33]:
# Print the per-fold scores stored in cv_scores using the project reporting helper
evaluate_cross_validation(cv_scores, "Logistic Regression")

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[92m          Logistic Regression Cross Validation Scores          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 1:                        |           0.52651           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 2:                        |           0.52609           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 3:                        |           0.52676           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 4:                        |           0.52640           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 5:                        |           0.52612           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [34]:
import pickle

# Save model path
model_name = "logistic_regression_model.pkl"
model_path = f"../../../models/{model_name}"

# Persist the tuned, fitted estimator. `log_reg_model` is only the template handed
# to GridSearchCV / cross_val_score, which fit clones of it, so pickling it would
# store an unfitted model that cannot predict after loading.
# NOTE(review): the fitted X_scaler is not saved here; anything loading this model
# must apply the same scaling to its inputs.
with open(model_path, "wb") as file:
    pickle.dump(grid_search.best_estimator_, file)

print(f"Model saved to {model_path}")

Model saved to ../../../models/logistic_regression_model.pkl


### Predict `Today to Tomorrow`
___
___
___

In [35]:
def preprocess_predict_data(df, columns_to_drop):
    """Build the prediction feature frame without modifying the caller's data.

    Args:
        df: Frame containing at least ``Date``, ``Ticker`` and
            ``Today to Tomorrow`` columns. It is left untouched.
        columns_to_drop: Column labels to remove before indexing.

    Returns:
        A new frame indexed by ``(Date, Ticker)`` with ``Date`` parsed to
        datetime, integer ``Year``/``Month``/``Day`` columns appended, and both
        ``columns_to_drop`` and the ``Today to Tomorrow`` target removed.

    Raises:
        KeyError: If a label in ``columns_to_drop``, an index column, or
            ``Today to Tomorrow`` is missing from the frame.
    """
    dates = pd.to_datetime(df["Date"])

    # assign() returns a new frame, so the input keeps its original columns and dtypes
    features = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # Drop the excluded columns first, then index by date and ticker
    features = features.drop(columns=columns_to_drop)
    features = features.set_index(["Date", "Ticker"])

    # The target is removed last so that only model inputs remain
    return features.drop(columns="Today to Tomorrow")

In [36]:
# Build the prediction feature matrix from a copy, dropping the same columns as for training
X_to_predict = preprocess_predict_data(predict_data.copy(), select_columns_to_drop)

# Apply the scaler fitted on the training split; it is only applied here, not re-fitted
X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of predict data:", X_to_predict_scaled.shape)
X_to_predict_scaled

Shape of predict data: (10941, 7)


array([[ 0.95662083,  0.09544006, -0.19318623, ...,  1.63587341,
         0.73919198,  1.63151544],
       [ 0.95662083,  0.98654678, -0.32567388, ...,  1.63587341,
         0.73919198,  1.63151544],
       [ 0.95662083,  0.5834852 , -0.60550521, ...,  1.63587341,
         0.73919198,  1.63151544],
       ...,
       [ 0.95662083,  0.08947738, -0.59636822, ...,  1.63587341,
         1.03280235,  1.51723724],
       [ 0.95662083,  2.52843059, -0.1947437 , ...,  1.63587341,
         1.03280235,  1.51723724],
       [-1.05228825, -0.39333869, -0.42524934, ...,  1.63587341,
         1.03280235,  1.51723724]])

In [40]:
# Predict the target class for every row of the held-out prediction window
# (the scaled predict data, not the test split)
y_to_predict = best_xgb_model.predict(X_to_predict_scaled)

print("Shape of y_to_predict:", y_to_predict.shape)
y_to_predict[:5]

Shape of y_to_predict: (10941,)


array([1., 1., 1., 1., 1.])

In [41]:
# Start from a copy of the prediction window so predict_data itself stays unchanged
prediction_df = predict_data.copy()

# Attach the model's predictions (same row order as predict_data)
prediction_df["Pred Today to Tomorrow"] = y_to_predict

# Flag rows where the predicted direction equals the actual one. Rows whose actual
# value is missing compare as False here and are removed by dropna() below.
prediction_df["Is Correct Prediction"] = (
    prediction_df["Today to Tomorrow"] == prediction_df["Pred Today to Tomorrow"]
)

# Keep only the reporting columns, drop rows with missing values, and index by
# date and ticker. This is chained rather than calling dropna(inplace=True) on a
# column subset, which can trigger pandas' SettingWithCopyWarning.
prediction_df = (
    prediction_df[
        [
            'Date',
            'Ticker',
            'Adjusted Close',
            'Today to Tomorrow',
            'Pred Today to Tomorrow',
            'Is Correct Prediction'
        ]
    ]
    .dropna()
    .set_index(["Date", "Ticker"])
)

print("Shape:", prediction_df.shape)
display(prediction_df.head())
display(prediction_df.tail())

Shape: (10441, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-30,A,148.244,-1.0,1.0,False
2024-09-30,AAPL,233.0,-1.0,1.0,False
2024-09-30,ABBV,195.9155,-1.0,1.0,False
2024-09-30,ABNB,126.81,-1.0,1.0,False
2024-09-30,ABT,113.475204,-1.0,1.0,False


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-28,XYL,130.79,-1.0,1.0,False
2024-10-28,YUM,134.86,-1.0,1.0,False
2024-10-28,ZBH,103.6,1.0,1.0,True
2024-10-28,ZBRA,363.58,1.0,1.0,True
2024-10-28,ZTS,182.76,-1.0,1.0,False


In [42]:
# Count correct vs. incorrect predictions over the rows that have a known actual value
prediction_df["Is Correct Prediction"].value_counts()

Is Correct Prediction
False    5296
True     5145
Name: count, dtype: int64

In [43]:
# Review the predictions for a single ticker
select_df = prediction_df.copy().reset_index()

select_ticker = select_df["Ticker"] == "AAPL"

# Re-index the ticker's rows once and reuse the view for both outputs
ticker_view = select_df[select_ticker].set_index(["Date", "Ticker"])

display(ticker_view["Is Correct Prediction"].value_counts())
ticker_view.tail()

Is Correct Prediction
True     12
False     9
Name: count, dtype: int64

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-22,AAPL,235.86,-1.0,1.0,False
2024-10-23,AAPL,230.76,-1.0,1.0,False
2024-10-24,AAPL,230.57,1.0,1.0,True
2024-10-25,AAPL,231.41,1.0,1.0,True
2024-10-28,AAPL,233.4,1.0,1.0,True


In [44]:
# Turn the (Date, Ticker) index back into ordinary columns
prediction_df = prediction_df.reset_index()

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
0,2024-09-30,A,148.244000,-1.0,1.0,False
1,2024-09-30,AAPL,233.000000,-1.0,1.0,False
2,2024-09-30,ABBV,195.915500,-1.0,1.0,False
3,2024-09-30,ABNB,126.810000,-1.0,1.0,False
4,2024-09-30,ABT,113.475204,-1.0,1.0,False
...,...,...,...,...,...,...
10436,2024-10-28,XYL,130.790000,-1.0,1.0,False
10437,2024-10-28,YUM,134.860000,-1.0,1.0,False
10438,2024-10-28,ZBH,103.600000,1.0,1.0,True
10439,2024-10-28,ZBRA,363.580000,1.0,1.0,True


In [45]:
# Write the prediction table through the project helper; the target path includes the .zip name
file_name = "log_reg_predict.zip"
file_path = "../../../data/raw_data/" + file_name

save_data(prediction_df, file_path)

[1m[35m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[35m║[0m[1m[95m File `log_reg_predict.zip` already exists. Overwriting file.  [0m[1m[35m║[0m
[1m[35m╚═══════════════════════════════════════════════════════════════╝[0m
[1m[32m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[32m║[0m[1m[92m        File saved and zipped as `log_reg_predict.zip`         [0m[1m[32m║[0m
[1m[32m╚═══════════════════════════════════════════════════════════════╝[0m
