In [1]:
# Import standard library modules
import sys

# Set the relative path to the project root directory
relative_path_to_root = "../../../"

# Add the project root to the system path for importing in-house modules
sys.path.append(relative_path_to_root)

# Import in-house modules from the 'utilities' package
from utilities import clean_historical_data, check_tickers_for_missing_values
from utilities import calc_vif, calc_p_values, calc_correlation, highlight_vif, highlight_p_values, evaluate_regression_model, evaluate_cross_validation, evaluate_classifier_model
from utilities import load_data, save_data

In [2]:
# Data manipulation and analysis
import pandas as pd

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Base name of the raw dataset (no extension); the path is relative to this notebook.
# NOTE(review): the "../../../" prefix repeats `relative_path_to_root` from the first cell.
file_name = "updated_w_nas"
file_path = f"../../../data/raw_data/{file_name}"

# Load the raw dataset through the in-house `load_data` helper
raw_data = load_data(file_path)

[1m[36m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[36m║[0m[1m[96m File `updated_w_nas.csv.bz2` loaded from `updated_w_nas.zip`  [0m[1m[36m║[0m
[1m[36m╚═══════════════════════════════════════════════════════════════╝[0m


#### Inspect Tickers Individually

In [4]:
# Inspect the raw data, zeroing in on an individual ticker.
# Work on a copy so the inspection cannot modify `raw_data`.
select_df = raw_data.copy()

# Boolean mask selecting the rows of a single ticker
select_ticker = select_df["Ticker"] == "AAPL"

# Show the last three rows for that ticker
select_df[select_ticker].tail(3)

Unnamed: 0,Date,Ticker,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
2119732,2024-10-23,AAPL,230.76,-1.0,-1.0,230.57,235.86,-0.021623,0.013733,57.298534,226.4908,221.05629,200.30862,237.57504,221.87895,216.32,236.48,short
2120233,2024-10-24,AAPL,230.57,1.0,-1.0,231.41,230.76,-0.000823,0.013688,55.555565,226.6678,221.42393,200.53928,237.66525,222.09375,216.32,236.48,buy
2120734,2024-10-25,AAPL,231.41,,1.0,,230.57,0.003643,0.01367,66.38572,226.8016,221.79678,200.76889,237.80995,222.31105,216.32,236.48,


### Data Pre-Processing
___

In [5]:
# Remove rows where `Today to Tomorrow` is 0
value_filter = raw_data["Today to Tomorrow"] == 0

# NaN labels compare unequal to 0, so those rows are kept by this filter
raw_data = raw_data.loc[~value_filter]

# Confirm which label values remain
raw_data["Today to Tomorrow"].unique()

array([-1.,  1., nan])

In [6]:
def split_dataset_by_date(raw_data: pd.DataFrame, split_date: str) -> tuple:
    """Split a frame into rows before `split_date` and rows on or after it.

    Args:
        raw_data: Frame with a `Date` column that can be compared to `split_date`.
        split_date: Boundary value; rows whose `Date` equals it go to the second frame.

    Returns:
        A `(historical_data, predict_data)` tuple. `historical_data` holds the rows
        with `Date < split_date`, `predict_data` holds all remaining rows. Both
        frames have a fresh 0-based index.
    """
    # True for rows strictly before the boundary
    is_before_split = raw_data["Date"] < split_date

    return (
        raw_data[is_before_split].reset_index(drop=True),
        raw_data[~is_before_split].reset_index(drop=True),
    )

#### Split recent data (on or after the split date, for prediction) from historical data (for training)

In [7]:
# Rows dated before `split_date` become the training history;
# rows on or after it are held out for prediction.
split_date = "2024-09-30"

historical_data, predict_data = split_dataset_by_date(raw_data, split_date)

print("Split Date:", split_date)

Split Date: 2024-09-30


#### Ensure the Only Missing Values in the Prediction Data Are the Ones We Are Predicting

In [8]:
# Run the prediction rows through the in-house cleaning helper.
# NOTE(review): `clean_historical_data` lives in `utilities`; what it changes is not visible here.
predict_data = clean_historical_data(predict_data)

# Report the date range covered by the prediction set
print("Start Date:", predict_data["Date"].min())
print("End Date:", predict_data["Date"].max())

# Count the missing values per column
predict_data.isnull().sum()

Start Date: 2024-09-30
End Date: 2024-10-25


Date                    0
Ticker                  0
Adjusted Close          0
Today to Tomorrow     501
Yesterday to Today      0
Next Day Close        501
Previous Day Close      0
Return                  0
Volatility              0
RSI                     0
SMA_50                  0
SMA_100                 0
SMA_200                 0
Upper Band              0
Lower Band              0
Support                 0
Resistance              0
Action                501
dtype: int64

#### Handle Missing Values (NA's)


In [9]:
# Run the training rows through the in-house cleaning helper.
# NOTE(review): `clean_historical_data` lives in `utilities`; what it changes is not visible here.
historical_data = clean_historical_data(historical_data)

# Report the date range covered by the training set
print("Start Date:", historical_data["Date"].min())
print("End Date:", historical_data["Date"].max())

# Count the missing values per column
historical_data.isnull().sum()

Start Date: 2008-01-02
End Date: 2024-09-27


Date                  0
Ticker                0
Adjusted Close        0
Today to Tomorrow     0
Yesterday to Today    0
Next Day Close        0
Previous Day Close    0
Return                0
Volatility            0
RSI                   0
SMA_50                0
SMA_100               0
SMA_200               0
Upper Band            0
Lower Band            0
Support               0
Resistance            0
Action                0
dtype: int64

In [10]:
# Replace -1 values with 0 so the target uses the class labels 0 and 1
historical_data["Today to Tomorrow"] = historical_data["Today to Tomorrow"].replace(-1, 0)

# Convert floating point values to integers
historical_data["Today to Tomorrow"] = historical_data["Today to Tomorrow"].astype(int)

# NOTE(review): only `historical_data` is recoded here; `predict_data` keeps its original labels.
historical_data["Today to Tomorrow"].unique()

array([0, 1])

In [11]:
# Partition tickers by whether they have missing values (in-house helper from `utilities`)
tickers_no_missing_values, tickers_with_missing_values = check_tickers_for_missing_values(historical_data)

[1m[90m╔═══════════════════════════════════════════════════════════════╗[0m
[1m[90m║[0m[1m[97m          Tickers that do not have any missing values          [0m[1m[90m║[0m
[1m[90m╠═══════════════════════════════════════════════════════════════╣[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of unique tickers:      |          500.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with no m... |          500.00000          [0m[1m[90m ║[0m[0m
[1m[97m[1m[90m║ [0m[1m[97mNumber of tickers with miss... |           0.00000           [0m[1m[90m ║[0m[0m
[1m[90m╚═══════════════════════════════════════════════════════════════╝[0m


### Exploratory Data Analysis (EDA):
___

#### Predict Data:

In [12]:
# Rows and columns in the prediction set
print("Shape:", predict_data.shape)

Shape: (9963, 18)


#### Historical Data:

In [13]:
# Rows and columns in the training set
print("Shape:", historical_data.shape)

Shape: (1954741, 18)


### Create Multiple Versions of Dataset
___

In [14]:
# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `main_data` with date parts added, indexed by (`Date`, `Ticker`).

    Args:
        main_data: Frame containing at least a `Date` and a `Ticker` column.
            It is copied, so the caller's frame is left untouched.

    Returns:
        A new frame in which `Date` has been parsed with `pd.to_datetime`,
        integer `Year`, `Month` and `Day` columns have been appended, and
        `Date` and `Ticker` form the index.
    """
    # Copy first and start from a clean positional index
    frame = main_data.copy().reset_index(drop=True)

    # Parse once and reuse the parsed series for every derived column
    parsed_dates = pd.to_datetime(frame["Date"])
    frame = frame.assign(
        Date=parsed_dates,
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        Day=parsed_dates.dt.day,
    )

    return frame.set_index(["Date", "Ticker"])

# `prepare_data_v2` copies and re-indexes its input itself, so the frame is passed as-is
main_data = prepare_data_v2(historical_data)

#### Select which version of the data to work with

In [15]:
# Display the prepared frame, indexed by (Date, Ticker)
main_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2008-01-02,A,23.256380,0,-1.0,23.025750,23.538280,-0.011976,0.015704,48.827488,23.314175,23.299887,23.564934,24.727251,22.540232,21.392029,24.351933,short,2008,1,2
2008-01-02,AAPL,5.876341,1,-1.0,5.879055,5.974061,-0.016357,0.018937,59.067430,5.518483,4.939064,4.197630,6.135834,5.403559,4.637377,6.026839,buy,2008,1,2
2008-01-02,ABT,18.130203,0,-1.0,18.019758,18.240652,-0.006055,0.010484,34.677357,18.138458,17.628250,17.709028,19.233109,18.221800,16.775562,19.134012,short,2008,1,2
2008-01-02,ACGL,7.608889,1,-1.0,7.764444,7.816667,-0.026581,0.016022,45.154190,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy,2008,1,2
2008-01-02,ACN,26.437070,0,1.0,25.982527,26.415077,0.000833,0.024039,54.812187,26.577982,27.784420,28.471031,28.227200,24.273775,24.765510,29.215685,sell,2008,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-27,XYL,134.510000,1,1.0,135.030000,134.130000,0.002833,0.015274,70.569610,133.008790,135.843140,128.534210,137.803190,126.457810,125.805660,141.154860,hold,2024,9,27
2024-09-27,YUM,139.920000,0,1.0,139.710000,138.070000,0.013399,0.012814,66.764175,133.453030,133.806440,133.268300,138.392100,128.823900,125.324680,139.920000,sell,2024,9,27
2024-09-27,ZBH,107.980000,0,1.0,107.950000,107.471130,0.004735,0.020791,70.106766,109.309240,110.604160,116.869156,114.414790,100.352870,104.238320,115.912370,sell,2024,9,27
2024-09-27,ZBRA,368.600000,1,-1.0,370.320000,371.240000,-0.007111,0.018034,83.407295,339.336200,326.619300,300.337160,379.808500,314.348500,314.650000,371.240000,buy,2024,9,27


In [16]:
# Work on a copy so `main_data` stays intact
select_data = main_data.copy()

# Columns excluded from the model input.
# An earlier variant dropped only "Action" and "Next Day Close";
# the other five follow the VIF recommendation noted further down in this notebook.
select_columns_to_drop = ["Action", "Next Day Close", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]

data = select_data.drop(columns=select_columns_to_drop)

print("Shape:", data.shape)
data.head()

Shape: (1954741, 12)


Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted Close,Today to Tomorrow,Yesterday to Today,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2008-01-02,A,23.25638,0,-1.0,-0.011976,0.015704,48.827488,23.299887,22.540232,21.392029,2008,1,2
2008-01-02,AAPL,5.876341,1,-1.0,-0.016357,0.018937,59.06743,4.939064,5.403559,4.637377,2008,1,2
2008-01-02,ABT,18.130203,0,-1.0,-0.006055,0.010484,34.677357,17.62825,18.2218,16.775562,2008,1,2
2008-01-02,ACGL,7.608889,1,-1.0,-0.026581,0.016022,45.15419,7.878933,7.378535,7.463333,2008,1,2
2008-01-02,ACN,26.43707,0,1.0,0.000833,0.024039,54.812187,27.78442,24.273775,24.76551,2008,1,2


### Split data features `X` and target `y`
___

In [17]:
# Name of the column the classifier learns to predict
target = "Today to Tomorrow"

# Split the data into features (X) and target (y)
X = data.drop(columns=target)

y = data[target]

# X and y must have the same number of rows
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1954741, 11)
Shape of y: (1954741,)


In [18]:
# Class balance of the target
y.value_counts()

Today to Tomorrow
1    1024076
0     930665
Name: count, dtype: int64

### Feature Engineering
___

#### Inspect Multicollinearity using VIF

In [19]:
# Perform correlation matrix of X
calc_correlation(X)

Unnamed: 0,Adjusted Close,Yesterday to Today,Return,Volatility,RSI,SMA_100,Lower Band,Support,Year,Month,Day
Adjusted Close,1.0,0.008107,0.004731,-0.049938,0.023563,0.995333,0.998273,0.996687,0.260676,0.001491,0.000905
Yesterday to Today,0.008107,1.0,0.633241,-0.013527,0.202777,0.002716,0.003129,0.003321,0.010942,0.000118,-0.008592
Return,0.004731,0.633241,1.0,0.0272,0.215132,-0.001698,-0.002069,-0.001509,0.001738,0.002961,-0.000139
Volatility,-0.049938,-0.013527,0.0272,1.0,-0.096098,-0.038884,-0.059617,-0.05788,-0.13665,0.010435,-0.003628
RSI,0.023563,0.202777,0.215132,-0.096098,1.0,0.000156,0.007298,0.004226,0.010794,-0.002865,0.003951
SMA_100,0.995333,0.002716,-0.001698,-0.038884,0.000156,1.0,0.995418,0.997022,0.263592,0.002242,0.001177
Lower Band,0.998273,0.003129,-0.002069,-0.059617,0.007298,0.995418,1.0,0.997968,0.260214,0.001318,0.00093
Support,0.996687,0.003321,-0.001509,-0.05788,0.004226,0.997022,0.997968,1.0,0.260756,0.001808,0.00125
Year,0.260676,0.010942,0.001738,-0.13665,0.010794,0.263592,0.260214,0.260756,1.0,-0.036901,0.000166
Month,0.001491,0.000118,0.002961,0.010435,-0.002865,0.002242,0.001318,0.001808,-0.036901,1.0,2.8e-05


##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

In [20]:
# Perform Variance Inflation Factor (VIF) analysis
vif = calc_vif(X)

vif.style.apply(lambda x: highlight_vif(x, threshold=2000))

Unnamed: 0,VIF
Yesterday to Today,1.688156
Return,1.702374
Volatility,3.432712
Day,4.229874
Month,4.622466
RSI,13.151201
Year,23.603299
SMA_100,234.216872
Adjusted Close,430.123894
Support,465.656608


### Data Splitting
___

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1563792, 11)
Shape of X_test: (390949, 11)
Shape of y_train: (1563792,)
Shape of y_test: (390949,)


#### Inspect Probability Values `(p-values)`

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [22]:
# Check P-Values
p_values, ols_model = calc_p_values(X_train, y_train)

p_values.style.apply(highlight_p_values)

Unnamed: 0,p_value
Year,0.0
Volatility,0.0
Return,0.0
RSI,0.0
Yesterday to Today,0.0
Adjusted Close,0.161738
Support,0.268161
Day,0.305965
Lower Band,0.449792
SMA_100,0.463475


### Model Training
___


#### Scale the data using `StandardScaler`

In [23]:
# Scale using StandardScaler
X_scaler = StandardScaler()
# y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
# print("y_train_scaled shape:", y_train_scaled.shape)
# print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1563792, 11)
X_test_scaled shape: (390949, 11)


In [None]:
# Import KNeighborsClassifier from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Odd neighbour counts 1, 3, ..., 19; defined once and reused for the loop and the index
k_values = range(1, 20, 2)

train_scores = []
test_scores = []

for k in k_values:
    # Fit one classifier per k on the scaled training features
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)

    # Mean accuracy on the training split and on the held-out split
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)

    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# One row per k, holding both scores
knn_df = pd.DataFrame(
    {"train_score": train_scores, "test_score": test_scores},
    index=pd.Index(k_values, name="k"),
)

knn_df

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s


In [None]:
import matplotlib.pyplot as plt

# Same neighbour counts that were used to fill `train_scores` / `test_scores`
k_values = range(1, 20, 2)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per split so the gap between training and held-out accuracy is visible
ax.plot(k_values, train_scores, marker="o", label="Train Score")
ax.plot(k_values, test_scores, marker="x", label="Test Score")

ax.set(
    xlabel="k (Number of Neighbors)",
    ylabel="Score",
    title="KNN Model Performance",  # the comma here was missing, which made the cell a SyntaxError
    xticks=k_values
)

ax.grid(True)
ax.legend()

plt.show()

In [None]:
# The original cell held an unterminated `print(` and could not run.
# NOTE(review): intent unknown — this reports the k with the highest test score from `knn_df`.
best_k = knn_df["test_score"].idxmax()
print(f"Best k: {best_k} (test score: {knn_df.loc[best_k, 'test_score']:.3f})")

In [None]:
import json

# Pretty-print the best hyper-parameters found by the grid search.
# NOTE(review): `grid_search` is not created in any cell visible in this notebook —
# confirm the cell that builds and fits it still exists, otherwise this fails on a fresh kernel.
print(json.dumps(grid_search.best_params_, indent=4))

In [None]:
# Keep the grid search's best estimator; later cells use it for evaluation and prediction.
# NOTE(review): depends on `grid_search`, which no visible cell defines.
best_xgb_model = grid_search.best_estimator_

In [None]:
import pickle

# File name and location of the pickled model (path is relative to this notebook)
model_name = "XGBClassifier_v1.pkl"
model_path = f"../../../models/{model_name}"

# Save the model to a file; the `with` block closes the file even if pickling fails
with open(model_path, "wb") as file:
    pickle.dump(best_xgb_model, file)

print(f"Model saved to {model_path}")

In [24]:
# Predict the target values using the testing data
y_test_predict = best_xgb_model.predict(X_test_scaled)
y_train_predict = best_xgb_model.predict(X_train_scaled)

In [None]:
# Evaluate the predictions with the in-house helper; it returns one result per split,
# which later cells wrap in labelled DataFrames as confusion matrices
confusion_train, confusion_test = evaluate_classifier_model(
    "XGBoost Classifier",
    y_train,
    y_test,
    y_train_predict,
    y_test_predict
)

In [None]:
import numpy as np

# Sorted unique labels present in the training target
classes = np.unique(y_train)
print("Class order:", classes)

In [None]:
# Label the training confusion matrix.
# The target was recoded from -1/1 to 0/1 before training, so the classes are 0 and 1.
# NOTE(review): assumes rows are actual classes and columns predicted classes, in ascending label order.
df_confusion_train = pd.DataFrame(
    confusion_train,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

df_confusion_train

In [None]:
# Label the testing confusion matrix.
# The target was recoded from -1/1 to 0/1 before training, so the classes are 0 and 1.
# NOTE(review): assumes rows are actual classes and columns predicted classes, in ascending label order.
df_confusion_test = pd.DataFrame(
    confusion_test,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

df_confusion_test

In [29]:
# 5-fold cross-validation of the selected model on the scaled training split
cv_scores = cross_val_score(
    best_xgb_model,
    X_train_scaled,
    y_train,  # Binary labels: 0 (recoded from -1) and 1
    scoring="accuracy",  # Use accuracy for classification
    cv=5
)

In [None]:
# Summarise the cross-validation scores.
# The label names the model that was actually scored (`best_xgb_model`),
# matching the name used for the classifier evaluation earlier.
evaluate_cross_validation(cv_scores, "XGBoost Classifier")

In [None]:
import pickle

# File name and location of the pickled model (path is relative to this notebook)
# NOTE(review): the object saved below is `best_xgb_model`, yet the file is named
# "logistic_regression_model.pkl" — confirm the name before anything loads it as a logistic regression.
model_name = "logistic_regression_model.pkl"
model_path = f"../../../models/{model_name}"

# Save the model to a file
with open(model_path, "wb") as file:
    pickle.dump(best_xgb_model, file)

print(f"Model saved to {model_path}")

### Predict `Today to Tomorrow` for the Held-Out Dates
___
___
___

In [31]:
def preprocess_predict_data(df, columns_to_drop, target_column="Today to Tomorrow"):
    """Build the model's feature frame from raw prediction rows.

    The input frame is not modified; all work happens on a new frame.

    Args:
        df: Frame containing `Date`, `Ticker`, `target_column` and every
            column listed in `columns_to_drop`.
        columns_to_drop: Column labels to remove before the index is set.
        target_column: Label column to remove from the features.
            Defaults to "Today to Tomorrow".

    Returns:
        A new frame indexed by (`Date`, `Ticker`), with `Date` parsed by
        `pd.to_datetime`, integer `Year`, `Month` and `Day` columns appended,
        and neither `columns_to_drop` nor `target_column` present.

    Raises:
        KeyError: If a column to drop, `Date`, `Ticker` or `target_column`
            is missing from `df`.
    """
    # Parse once and reuse the parsed series for every derived column
    parsed_dates = pd.to_datetime(df["Date"])

    # `assign` returns a new frame, so the caller's `df` keeps its original columns
    features = df.assign(
        Date=parsed_dates,
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        Day=parsed_dates.dt.day,
    )

    # Drop specified columns
    features = features.drop(columns=columns_to_drop)

    # Set 'Date' and 'Ticker' as the index
    features = features.set_index(["Date", "Ticker"])

    # The label is not a feature
    features = features.drop(columns=target_column)

    return features

In [None]:
# Make a copy of the data we will use to predict
X_to_predict = predict_data.copy()

# Preprocess the data for prediction, dropping the same columns that were dropped for training
X_to_predict = preprocess_predict_data(X_to_predict, select_columns_to_drop)

# Scale the data using the StandardScaler that was fitted on the training features
X_to_predict_scaled = X_scaler.transform(X_to_predict)

print("Shape of predict data:", X_to_predict_scaled.shape)
X_to_predict_scaled

In [None]:
# Predict the target values for the held-out prediction data
y_to_predict = best_xgb_model.predict(X_to_predict_scaled)

print("Shape of y_to_predict:", y_to_predict.shape)
# Peek at the first few predictions
y_to_predict[:5]

In [None]:
# Create a copy of the predict data so we may add the predictions
prediction_df = predict_data.copy()

# Recode the actual labels from -1/1 to 0/1, the encoding the model was trained on.
# Only the training data was recoded earlier; without this step every actual -1
# is compared with a predicted 0 and counted as a wrong prediction.
# Missing labels stay NaN, and the call is a no-op if the labels are already 0/1.
prediction_df["Today to Tomorrow"] = prediction_df["Today to Tomorrow"].replace(-1, 0)

# Add the predictions to the predict data made by the model
prediction_df["Pred Today to Tomorrow"] = y_to_predict

# Flag whether each prediction matches the actual label
prediction_df["Is Correct Prediction"] = prediction_df["Today to Tomorrow"] == prediction_df["Pred Today to Tomorrow"]

# Select the columns to keep
prediction_df = prediction_df[
    [
        'Date',
        'Ticker',
        'Adjusted Close',
        'Today to Tomorrow',
        'Pred Today to Tomorrow',
        'Is Correct Prediction'
    ]
]

# Drop rows with missing values (rows without an actual label cannot be scored).
# Reassigning avoids an in-place change on a column selection of another frame.
prediction_df = prediction_df.dropna()

# Set the index to 'Date' and 'Ticker' to better group the data
prediction_df = prediction_df.set_index(["Date", "Ticker"])

print("Shape:", prediction_df.shape)
display(prediction_df.head())
display(prediction_df.tail())

In [None]:
# Count correct vs. incorrect predictions over the scored rows
prediction_df["Is Correct Prediction"].value_counts()

In [None]:
# Inspect Cleaned data zeroing in on individual tickers
select_df = prediction_df.copy().reset_index()

select_ticker = select_df["Ticker"] == "AAPL"

display(select_df[select_ticker].set_index(["Date", "Ticker"])["Is Correct Prediction"].value_counts())
select_df[select_ticker].set_index(["Date", "Ticker"]).tail()

In [None]:
# Make index column a regular column
prediction_df.reset_index(inplace=True)

prediction_df

In [None]:
# Output archive for the scored predictions (path is relative to this notebook).
# NOTE(review): the predictions come from `best_xgb_model`, yet the file is named "log_reg_predict"
# and is written into the raw_data directory — confirm both are intended.
file_name = "log_reg_predict.zip"
file_path = f"../../../data/raw_data/{file_name}"

# Persist the frame through the in-house `save_data` helper
save_data(prediction_df, file_path)