In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import clean_historical_data, check_tickers_for_missing_values
from utilities import calc_vif, calc_p_values, calc_correlation, highlight_vif, highlight_p_values, evaluate_regression_model, evaluate_cross_validation, evaluate_classifier_model
from utilities import load_data, save_data

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [3]:
file_name = "updated_w_nas"
file_path = f"../../../data/raw_data/{file_name}"

raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96m File `updated_w_nas.csv.bz2` loaded from `updated_w_nas.zip`  [0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


#### Inspect Tickers Individually

In [4]:
# Inspect Cleaned data zeroing in on individual tickers
select_df = raw_data.copy()

select_ticker = select_df["Ticker"] == "AAPL"

select_df[select_ticker].tail(3)

Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
2120233,2024-10-24,AAPL,230.57,1.0,-1.0,231.41,230.76,-0.000823,0.013688,55.555565,226.6678,221.42393,200.53928,237.66525,222.09375,216.32,236.48,buy
2120734,2024-10-25,AAPL,231.41,1.0,1.0,233.49,230.57,0.003643,0.01367,66.38572,226.8016,221.79678,200.76889,237.80995,222.31105,216.32,236.48,hold
2121235,2024-10-28,AAPL,233.49,,1.0,,231.41,0.008988,0.013784,63.955185,226.9504,222.17525,201.0119,237.87657,222.29343,216.32,236.48,


### Data Pre-Processing
___

In [5]:
# Remove where Today to Tomorrow is "O"
value_filter = raw_data["Today to Tomorrow"] == 0

raw_data = raw_data.loc[~value_filter]

raw_data["Today to Tomorrow"].unique()

array([-1.,  1., nan])

In [6]:
def split_dataset_by_date(raw_data: pd.DataFrame, split_date: str) -> tuple:
    # Filter data by date range
    filter_data_by_date = raw_data["Date"] < split_date
    
    # Create a new dataframe with today's data
    historical_data = raw_data[filter_data_by_date].reset_index(drop=True)
    
    # Create a new dataframe with historical data (excluding today's data)
    predict_data = raw_data[~filter_data_by_date].reset_index(drop=True)
    
    return historical_data, predict_data

#### Split todays data (For prediction) and historical data (For training)

In [7]:
# todays_date = "2024-10-25"
split_date = "2024-09-30"

historical_data, predict_data = split_dataset_by_date(raw_data, split_date)

print("Split Date:", split_date)

Split Date: 2024-09-30


#### Ensure Missing Values on Todays Data is what we are Predicting

In [8]:
predict_data = clean_historical_data(predict_data)

print("Start Date:", predict_data["Date"].min())
print("End Date:", predict_data["Date"].max())
predict_data.isnull().sum()

Start Date: 2024-09-30
End Date: 2024-10-28


Date                    0
Ticker                  0
Adjusted Close          0
Today to Tomorrow     501
Yesterday to Today      0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle Missing Values (NA's)


In [9]:
historical_data = clean_historical_data(historical_data)

print("Start Date:", historical_data["Date"].min())
print("End Date:", historical_data["Date"].max())
historical_data.isnull().sum()

Start Date: 2008-01-02
End Date: 2024-09-27


Date                  0
Ticker                0
Adjusted Close        0
Today to Tomorrow     0
Yesterday to Today    0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [10]:
# Replace -1 values with 0
historical_data["Today to Tomorrow"] = historical_data["Today to Tomorrow"].replace(-1, 0)

# Convert floating point values to integers
historical_data["Today to Tomorrow"] = historical_data["Today to Tomorrow"].astype(int)

historical_data["Today to Tomorrow"].unique()

array([0, 1])

In [11]:
tickers_no_missing_values, tickers_with_missing_values = check_tickers_for_missing_values(historical_data)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          500.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          500.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

#### Predict Data:

In [12]:
print("Shape:", predict_data.shape)

Shape: (10460, 18)


#### Historical Data:

In [13]:
print("Shape:", historical_data.shape)

Shape: (1954743, 18)


### Create Multiple Versions of Dataset
___

In [14]:
# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the input DataFrame
    df = main_data.copy().reset_index(drop=True)
    
    # Convert the `Date` column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from the `Date` column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Set the index to `Date` and `Ticker`
    df = df.set_index(["Date", "Ticker"])
    
    return df

main_data = prepare_data_v2(historical_data.copy().reset_index(drop=True))

#### Select which version of the data to work with

In [15]:
main_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2008-01-02,A,23.256388,0,-1.0,23.025745,23.538284,-0.011976,0.015704,48.827595,23.314173,23.299887,23.564934,24.727250,22.540232,21.392027,24.351938,short,2008,1,2
2008-01-02,AAPL,5.876342,1,-1.0,5.879056,5.974059,-0.016357,0.018937,59.067417,5.518483,4.939064,4.197630,6.135834,5.403559,4.637375,6.026838,buy,2008,1,2
2008-01-02,ABT,18.130213,0,-1.0,18.019760,18.240650,-0.006054,0.010484,34.677270,18.138458,17.628250,17.709028,19.233109,18.221802,16.775555,19.134012,short,2008,1,2
2008-01-02,ACGL,7.608889,1,-1.0,7.764444,7.816667,-0.026581,0.016022,45.154190,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy,2008,1,2
2008-01-02,ACN,26.437065,0,1.0,25.982527,26.415085,0.000832,0.024039,54.812172,26.577984,27.784422,28.471031,28.227201,24.273775,24.765514,29.215681,sell,2008,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-27,XYL,134.510000,1,1.0,135.030000,134.130000,0.002833,0.015274,70.569610,133.008790,135.843140,128.534210,137.803190,126.457810,125.805660,141.154860,hold,2024,9,27
2024-09-27,YUM,139.920000,0,1.0,139.710000,138.070000,0.013399,0.012814,66.764175,133.453030,133.806440,133.268300,138.392100,128.823900,125.324680,139.920000,sell,2024,9,27
2024-09-27,ZBH,107.980000,0,1.0,107.950000,107.471130,0.004735,0.020791,70.106766,109.309240,110.604160,116.869156,114.414790,100.352870,104.238320,115.912370,sell,2024,9,27
2024-09-27,ZBRA,368.600000,1,-1.0,370.320000,371.240000,-0.007111,0.018034,83.407295,339.336200,326.619300,300.337160,379.808500,314.348500,314.650000,371.240000,buy,2024,9,27


In [16]:
select_data = main_data.copy()

# select_columns_to_drop = ["Action", "Next Day Close"]
select_columns_to_drop = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1954743, 12)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2008-01-02,A,23.256388,0,-1.0,-0.011976,0.015704,48.827595,23.299887,22.540232,21.392027,2008,1,2
2008-01-02,AAPL,5.876342,1,-1.0,-0.016357,0.018937,59.067417,4.939064,5.403559,4.637375,2008,1,2
2008-01-02,ABT,18.130213,0,-1.0,-0.006054,0.010484,34.67727,17.62825,18.221802,16.775555,2008,1,2
2008-01-02,ACGL,7.608889,1,-1.0,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1,2
2008-01-02,ACN,26.437065,0,1.0,0.000832,0.024039,54.812172,27.784422,24.273775,24.765514,2008,1,2


### Split data features `X` and target `y`
___

In [17]:
target = "Today to Tomorrow"

# Split the data into features (X) and target (y)
X = data.drop(columns=target)

y = data[target]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1954743, 11)
Shape of y: (1954743,)


In [18]:
y.value_counts()

Today to Tomorrow
1    1024075
0     930668
Name: count, dtype: int64

### Feature Engineering
___

#### Inspect Multicollinearity using VIF

In [19]:
# Perform correlation matrix of X
calc_correlation(X)

Unnamed: 0,Adjusted Close,Yesterday to Today,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Adjusted Close,1.0,0.008106,0.004731,-0.049932,0.023569,0.995333,0.998273,0.996687,0.260678,0.001489,0.000905
Yesterday to Today,0.008106,1.0,0.633242,-0.013523,0.202782,0.002715,0.003127,0.00332,0.010943,0.000113,-0.008597
Return,0.004731,0.633242,1.0,0.027199,0.215122,-0.001698,-0.002068,-0.001509,0.001729,0.00296,-0.000139
Volatility,-0.049932,-0.013523,0.027199,1.0,-0.096124,-0.038877,-0.059611,-0.057873,-0.136661,0.010419,-0.003631
RSI,0.023569,0.202782,0.215122,-0.096124,1.0,0.000162,0.007304,0.004232,0.010779,-0.002875,0.003973
SMA_100,0.995333,0.002715,-0.001698,-0.038877,0.000162,1.0,0.995418,0.997022,0.263594,0.00224,0.001177
Lower Band,0.998273,0.003127,-0.002068,-0.059611,0.007304,0.995418,1.0,0.997968,0.260216,0.001316,0.000931
Support,0.996687,0.00332,-0.001509,-0.057873,0.004232,0.997022,0.997968,1.0,0.260757,0.001807,0.00125
Year,0.260678,0.010943,0.001729,-0.136661,0.010779,0.263594,0.260216,0.260757,1.0,-0.036899,0.000165
Month,0.001489,0.000113,0.00296,0.010419,-0.002875,0.00224,0.001316,0.001807,-0.036899,1.0,2.8e-05


##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

In [20]:
# Perform Variance Inflation Factor (VIF) analysis
vif = calc_vif(X)

vif.style.apply(lambda x: highlight_vif(x, threshold=2000))

Unnamed: 0,VIF
Yesterday to Today,1.688163
Return,1.70237
Volatility,3.432675
Day,4.229853
Month,4.622485
RSI,13.151686
Year,23.603973
SMA_100,234.217598
Adjusted Close,430.127732
Support,465.657642


### Data Splitting
___

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1563794, 11)
Shape of X_test: (390949, 11)
Shape of y_train: (1563794,)
Shape of y_test: (390949,)


#### Inspect Probability Values `(p-values)`|

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [22]:
# Check P-Values
p_values, ols_model = calc_p_values(X_train, y_train)

p_values.style.apply(highlight_p_values)

Unnamed: 0,p_value
Year,0.0
Volatility,0.0
Return,0.0
RSI,0.0
Yesterday to Today,0.0
Support,0.01513
Adjusted Close,0.042993
SMA_100,0.285761
Month,0.569993
Day,0.583266


### Model Training
___


#### Scale the data using `StandardScaler`

In [23]:
# Scale using StandardScaler
X_scaler = StandardScaler()
# y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
# print("y_train_scaled shape:", y_train_scaled.shape)
# print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1563794, 11)
X_test_scaled shape: (390949, 11)


In [24]:
# Save X_xcaler model

# import joblib 

# # Save the model as a pickle file
# file_name = "X_scaler.pkl"
# file_path = f"../../../models/{file_name}"

# joblib.dump(X_scaler, file_path)


['../../../models/X_scaler.pkl']

In [25]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(
    colsample_bytree=1.0,
    gamma=0,
    learning_rate=0.15,
    max_depth=5,
    n_estimators=300,
    subsample=1.0,
    n_jobs=-1
)

xgb_clf.fit(X_train_scaled, y_train)

In [26]:
# import pickle

# # Save model path
# model_name = "XGBClassifier_v1.pkl"
# model_path = f"../../../models/{model_name}"

# # Save the model to a file
# with open(model_path, "wb") as file:
#     pickle.dump(xgb_clf, file)

# print(f"Model saved to {model_path}")

In [27]:
# Predict the target values using the testing data
y_test_predict = xgb_clf.predict(X_test_scaled)
y_train_predict = xgb_clf.predict(X_train_scaled)

In [28]:
confusion_train, confusion_test = evaluate_classifier_model(
    "XGBoost Classifier",
    y_train,
    y_test,
    y_train_predict,
    y_test_predict
)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[96m              XGBoost Classifier Model Evaluation              [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[33mTraining Data Metrics          |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mAccuracy (Train):              |           0.62941           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mPrecision (Train):             |           0.63125           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mRecall (Train):                |           0.62941           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[96mF1 Score (Train):              |           0.62379           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

In [29]:
import numpy as np

classes = np.unique(y_train)
print("Class order:", classes)

Class order: [0 1]


In [30]:
df_confusion_train = pd.DataFrame(
    confusion_train,
    index=["Actual -1", "Actual 1"],
    columns=["Predicted -1", "Predicted 1"]
)

df_confusion_train

Unnamed: 0,Predicted -1,Predicted 1
Actual -1,376657,368414
Actual 1,211105,607618


In [31]:
df_confusion_test = pd.DataFrame(
    confusion_test,
    index=["Actual -1", "Actual 1"],
    columns=["Predicted -1", "Predicted 1"]
)

df_confusion_test

Unnamed: 0,Predicted -1,Predicted 1
Actual -1,93033,92564
Actual 1,53751,151601


In [32]:
cv_scores = cross_val_score(
    xgb_clf,
    X_train_scaled,
    y_train,  # Use the original y_train with values 1, -1, and 0
    scoring="accuracy",  # Use accuracy for classification
    cv=5
)

In [33]:
evaluate_cross_validation(cv_scores, "XGBoost Classifier")

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[92m          XGBoost Classifier Cross Validation Scores           [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 1:                        |           0.62660           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 2:                        |           0.62530           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 3:                        |           0.62493           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 4:                        |           0.62428           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[34mFold 5:                        |           0.62755           [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97m                               |                             [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m

### Predict `Adjusted Close`
___
___
___

In [34]:
def preprocess_predict_data(df, columns_to_drop):
    # Convert 'Date' column to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    
    # Extract year, month, and day from 'Date' column
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    
    # Drop specified columns
    df = df.drop(columns=columns_to_drop)
    
    # Set 'Date' and 'Ticker' as the index
    df = df.set_index(["Date", "Ticker"])
    
    df = df.drop(columns="Today to Tomorrow")
    
    return df

In [35]:
# Make a copy of the data we will use to predict
X_to_predict = predict_data.copy()

# Preprocess the data for prediction
X_to_predict = preprocess_predict_data(X_to_predict, select_columns_to_drop)

# Scale the data using the StandardScaler
X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of predict data:", X_to_predict_scaled.shape)
X_to_predict_scaled

Shape of predict data: (10460, 11)


array([[0.30149681, 0.95644196, 0.0950344 , ..., 1.63653275, 0.73933339,
        1.63152031],
       [0.7248706 , 0.95644196, 0.984746  , ..., 1.63653275, 0.73933339,
        1.63152031],
       [0.53962584, 0.95644196, 0.58231546, ..., 1.63653275, 0.73933339,
        1.63152031],
       ...,
       [0.07824101, 0.95644196, 0.48971684, ..., 1.63653275, 1.03295107,
        1.40292582],
       [1.37689502, 0.95644196, 0.39329307, ..., 1.63653275, 1.03295107,
        1.40292582],
       [0.47436095, 0.95644196, 0.66967534, ..., 1.63653275, 1.03295107,
        1.40292582]])

In [36]:
# Predict the target values using the testing data
y_to_predict = xgb_clf.predict(X_to_predict_scaled)

print("Shape of y_to_predict:", y_to_predict.shape)
y_to_predict[:5]

Shape of y_to_predict: (10460,)


array([1, 0, 1, 1, 1])

In [37]:
# Create a copy of the predict data so we may add the predictions
prediction_df = predict_data.copy()

# Add the predictions to the predict data made by the model
prediction_df["Pred Today to Tomorrow"] = y_to_predict

# Replace 0 with -1
prediction_df["Pred Today to Tomorrow"] = prediction_df["Pred Today to Tomorrow"].replace(0, -1)

# Make a column "Correct Prediction" to check if the prediction is correct by comparing the actual value and the predicted value
prediction_df["Is Correct Prediction"] = prediction_df["Today to Tomorrow"] == prediction_df["Pred Today to Tomorrow"]

# Select the columns to keep
prediction_df = prediction_df[
    [
        'Date',
        'Ticker',
        'Adjusted Close',
        'Today to Tomorrow',
        'Pred Today to Tomorrow',
        'Is Correct Prediction'
    ]
]

# Drop rows with missing values
prediction_df.dropna(inplace=True)

# Set the index to 'Date' and 'Ticker' to better group the data
prediction_df = prediction_df.set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head())
display(prediction_df.tail())

Shape: (9959, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-30,A,148.244,-1.0,1,False
2024-09-30,AAPL,233.0,-1.0,-1,True
2024-09-30,ABBV,195.9155,-1.0,1,False
2024-09-30,ABNB,126.81,-1.0,1,False
2024-09-30,ABT,113.475204,-1.0,1,False


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-25,XYL,130.42,1.0,1,True
2024-10-25,YUM,133.04,1.0,1,True
2024-10-25,ZBH,102.35,1.0,1,True
2024-10-25,ZBRA,360.09,1.0,1,True
2024-10-25,ZTS,180.01,1.0,1,True


In [38]:
prediction_df["Is Correct Prediction"].value_counts()

Is Correct Prediction
True     5536
False    4423
Name: count, dtype: int64

In [39]:
# Inspect Cleaned data zeroing in on individual tickers
select_df = prediction_df.copy().reset_index()

select_ticker = select_df["Ticker"] == "ZTS"

display(select_df[select_ticker].set_index(["Date", "Ticker"])["Is Correct Prediction"].value_counts())
select_df[select_ticker].set_index(["Date", "Ticker"]).tail()

Is Correct Prediction
True     11
False     9
Name: count, dtype: int64

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-21,ZTS,189.45,1.0,1,True
2024-10-22,ZTS,189.51,-1.0,1,False
2024-10-23,ZTS,188.99,-1.0,1,False
2024-10-24,ZTS,181.5,-1.0,1,False
2024-10-25,ZTS,180.01,1.0,1,True


In [40]:
# Make index column a regular column
prediction_df.reset_index(inplace=True)

prediction_df

Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,Pred Today to Tomorrow,Is Correct Prediction
0,2024-09-30,A,148.244000,-1.0,1,False
1,2024-09-30,AAPL,233.000000,-1.0,-1,True
2,2024-09-30,ABBV,195.915500,-1.0,1,False
3,2024-09-30,ABNB,126.810000,-1.0,1,False
4,2024-09-30,ABT,113.475204,-1.0,1,False
...,...,...,...,...,...,...
9954,2024-10-25,XYL,130.420000,1.0,1,True
9955,2024-10-25,YUM,133.040000,1.0,1,True
9956,2024-10-25,ZBH,102.350000,1.0,1,True
9957,2024-10-25,ZBRA,360.090000,1.0,1,True


In [41]:
# file_name = "XGB_clf_predict.zip"
# file_path = f"../../../data/raw_data/{file_name}"

# save_data(prediction_df, file_path)