In [63]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from pathlib import Path

In [65]:
# Import data
file_path = Path("Resources/sp500_adj_close_raw.csv")
df = pd.read_csv(file_path)
# Convert date to datetime data type
df["Date"] = pd.to_datetime(df["Date"])
df.set_index('Date', inplace=True)
df.shape
df.tail()

Unnamed: 0_level_0,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-10-25,XYL,130.41,130.41,130.65,-0.001837,0.009297,39.056873,133.51048,134.41891,130.8492,138.39948,129.94153,126.71,137.53,short
2024-10-25,YUM,133.04,133.04,133.16,-0.000901,0.006858,34.395523,134.61107,133.44423,134.0083,139.50783,130.66417,129.71,139.92,short
2024-10-25,ZBH,102.33,102.33,104.0,-0.016058,0.01061,50.197327,107.36608,107.9697,115.37445,107.723175,101.30882,101.77,115.91237,short
2024-10-25,ZBRA,359.97,359.97,362.05,-0.005745,0.009788,43.769238,355.8908,336.8905,311.04135,380.01144,359.82355,320.77,377.68,short
2024-10-25,ZTS,179.91,179.91,181.5,-0.00876,0.012576,36.496883,189.094,183.3149,179.2437,197.88783,182.27017,179.91,196.48,short


In [66]:
# Do some feature engineering
df['Year'] = df.index.year
df['Month'] = df.index.month
df['Day'] = df.index.day
df['Day_of_Week'] = df.index.dayofweek
df.tail()

Unnamed: 0_level_0,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action,Year,Month,Day,Day_of_Week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2024-10-25,XYL,130.41,130.41,130.65,-0.001837,0.009297,39.056873,133.51048,134.41891,130.8492,138.39948,129.94153,126.71,137.53,short,2024,10,25,4
2024-10-25,YUM,133.04,133.04,133.16,-0.000901,0.006858,34.395523,134.61107,133.44423,134.0083,139.50783,130.66417,129.71,139.92,short,2024,10,25,4
2024-10-25,ZBH,102.33,102.33,104.0,-0.016058,0.01061,50.197327,107.36608,107.9697,115.37445,107.723175,101.30882,101.77,115.91237,short,2024,10,25,4
2024-10-25,ZBRA,359.97,359.97,362.05,-0.005745,0.009788,43.769238,355.8908,336.8905,311.04135,380.01144,359.82355,320.77,377.68,short,2024,10,25,4
2024-10-25,ZTS,179.91,179.91,181.5,-0.00876,0.012576,36.496883,189.094,183.3149,179.2437,197.88783,182.27017,179.91,196.48,short,2024,10,25,4


In [69]:
# Initialize the LabelEncoder and fit it to the Action column:

le_action = LabelEncoder()

df['Action'] = le_action.fit_transform(df['Action'])

print(df)

           Ticker  Adjusted Close  Next Day Close  Previous Day Close  \
Date                                                                    
2008-01-02      A       23.256376       23.025747           23.538282   
2008-01-02   AAPL        5.876341        5.879056            5.974059   
2008-01-02    ABT       18.130210       18.019756           18.240660   
2008-01-02   ACGL        7.608889        7.764444            7.816667   
2008-01-02    ACN       26.437080       25.982517           26.415075   
...           ...             ...             ...                 ...   
2024-10-25    XYL      130.410000      130.410000          130.650000   
2024-10-25    YUM      133.040000      133.040000          133.160000   
2024-10-25    ZBH      102.330000      102.330000          104.000000   
2024-10-25   ZBRA      359.970000      359.970000          362.050000   
2024-10-25    ZTS      179.910000      179.910000          181.500000   

              Return  Volatility        RSI      S

In [75]:
# Define X and Y variables for train and test datasets
y = df['Action'].astype(int) # Ensure y is an integer
X = df.drop(columns=["Action", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200", "Next Day Close", "Ticker"])

In [25]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Scale the data using standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [77]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}


In [79]:
# Initialize and run GridSearchCV
grid_xgb = GridSearchCV(XGBClassifier(), param_grid, cv=5, scoring='accuracy', verbose=2)
grid_xgb.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.2s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_e

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 1062902 1062903 1062904], got [1.4628518e-01 1.4995374e-01 1.5591525e-01 ... 9.8118000e+03 9.8923000e+03
 9.9244000e+03]

--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 1063069 1063070 1063071], got [1.3527934e-01 1.4628518e-01 1.4995374e-01 ... 9.8755800e+03 9.8923000e+03
 9.9244000e+03]

--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 1063154 1063155 1063156], got [1.3527934e-01 1.5568592e-01 1.5591525e-01 ... 9.8755800e+03 9.8923000e+03
 9.9244000e+03]

--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 1062896 1062897 1062898], got [1.3527934e-01 1.4628518e-01 1.4995374e-01 ... 9.8118000e+03 9.8755800e+03
 9.9244000e+03]

--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\CJHx6\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [      0       1       2 ... 1063279 1063280 1063281], got [1.3527934e-01 1.4628518e-01 1.4995374e-01 ... 9.7735800e+03 9.8755800e+03
 9.8923000e+03]


In [33]:
# Best hyperparameters and corresponding score
print("Best Parameters:", grid_xgb.best_params_)
print("Best Cross-Validation Score:", grid_xgb.best_score_)

In [35]:
# Fit the model with the best found hyperparameters
best_xgb_classifier = grid_xgb.best_estimator_
best_xgb_classifier.fit(X_train_scaled, y_train)

In [37]:
# Predict on the test set using the best model
y_pred_class = best_xgb_classifier.predict(X_test_scaled)

Mean Absolute Error: 2.3314645132348946
Mean Squared Error: 30.799548032384507
R-squared: 0.9992100761596074


In [41]:
# Evaluate the classifier
print("Classification Report for Best XGB Classifier:")
print(classification_report(y_test, y_pred_class))

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}


In [45]:
# Generate and display the confusion matrix
conf_matrix_class = confusion_matrix(y_test, y_pred_class)
conf_df_class = pd.DataFrame(
    conf_matrix_class, 
    index=label_encoder.classes_, 
    columns=label_encoder.classes_
)
print("Confusion Matrix:")
print(conf_df_class)

In [47]:
# Predict actions for the last day and export to CSV:
last_day_class = df.loc[df.index == df.index.max()].drop(columns=["Action", "Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200", "Next Day Close"])


               Actual  Predicted
Date                            
2023-04-03  81.373910  80.421031
2010-08-24  17.004652  17.285588
2022-08-10  96.575066  93.743052
2017-10-10  54.291428  54.233566
2009-11-06  78.234310  76.039661
...               ...        ...
2010-07-12  31.008406  30.068058
2010-04-26  31.837200  32.178479
2011-01-19  20.599249  19.054517
2013-09-12  25.092500  24.186095
2020-12-16  52.828434  51.677488

[395480 rows x 2 columns]


In [49]:
# Preserve the Ticker column from the last day
last_day_tickers = last_day_class['Ticker']

NameError: name 'tickers_test' is not defined

In [None]:
# Drop the Ticker column before scaling
X_last_day_class = last_day_class.drop(columns=["Ticker"])

In [None]:
# Standardize the last day data
last_day_scaled = scaler.transform(X_last_day_class)
predicted_actions = best_xgb_classifier.predict(last_day_scaled)

In [None]:
# Add predictions and ticker back to the DataFrame
last_day_class['Predicted_Action'] = label_encoder.inverse_transform(predicted_actions)
last_day_class['Ticker'] = last_day_tickers

In [None]:
# Display the last day's predictions with Ticker
print(last_day_class[['Ticker', 'Adjusted Close', 'Predicted_Action']])

In [None]:
# Export the predictions
last_day_class.to_csv('predicted_actions_last_day_XGboost.csv')