In [13]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
import sqlalchemy
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from pandas.tseries.offsets import DateOffset
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report

In [14]:
# For creating the DB
database_connection_string = 'sqlite:///SP500.db'
engine = sqlalchemy.create_engine(database_connection_string)

sp500 = pd.read_sql_table('SectorDF', engine, parse_dates=True)
sp500.set_index('timestamp', inplace=True)
# sp500.dtypes

# # Review the DataFrame
display(sp500.head())
display(sp500.tail())

Unnamed: 0_level_0,SPY Open,SPY Close,Industrials,Health Care,Information Technology,Communication Services,Consumer Staples,Consumer Discretionary,Utilities,Financials,Materials,Real Estate,Energy,SPY
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-04-13 04:00:00,412.86,411.45,-0.001641,0.00651,4.5e-05,-0.000767,0.000146,-2e-06,0.016096,-0.004232,-0.00321,0.007684,0.00114,0.0
2021-04-14 04:00:00,413.79,415.87,-0.000804,0.000509,-0.004617,-0.004491,-0.001597,-0.002539,0.007733,0.010694,0.009282,-0.004979,0.020342,1.0
2021-04-15 04:00:00,417.29,417.26,0.000453,0.011926,0.004855,0.003417,0.005316,-0.004535,0.011355,-0.001082,0.004324,0.011913,-0.006888,0.0
2021-04-16 04:00:00,416.26,415.21,-0.003498,0.002819,-0.002692,-0.003881,0.001754,0.002421,0.002735,-0.004246,-0.001499,-0.003487,-0.018957,0.0
2021-04-19 04:00:00,413.93,412.17,-0.004149,-0.000356,-0.009452,-0.002727,-0.002429,-0.007847,-0.005995,-0.003404,-0.005562,0.002253,-0.001944,0.0


Unnamed: 0_level_0,SPY Open,SPY Close,Industrials,Health Care,Information Technology,Communication Services,Consumer Staples,Consumer Discretionary,Utilities,Financials,Materials,Real Estate,Energy,SPY
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-04-06 04:00:00,445.67,448.77,0.002059,0.011917,-0.003392,-0.005008,0.009086,-0.006456,0.01911,-0.000604,-0.004417,0.014811,-0.010703,1.0
2022-04-07 04:00:00,448.04,447.57,0.005856,0.021049,0.005208,-0.009337,0.008555,0.004695,-0.003217,-0.00148,0.004416,-0.004456,0.002626,0.0
2022-04-08 04:00:00,444.11,439.92,-0.005791,0.005426,-0.007141,0.001863,0.00189,0.004482,0.001318,0.002899,0.00202,0.001696,0.027976,0.0
2022-04-11 04:00:00,443.02,438.29,-0.001636,-0.016449,-0.008222,-0.001345,-0.002333,0.006947,-0.014041,-0.002284,-0.001462,-0.010802,-0.018648,0.0
2022-04-12 04:00:00,437.96,443.31,-0.005875,-0.009887,-0.01583,-0.014729,0.000453,-0.009666,0.006666,-0.009787,-0.003684,0.001091,-0.003254,1.0


## RandomForestClassifier Model

In [15]:
X = sp500[['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 'Consumer Staples', 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 'Energy']]
y = sp500[['SPY']]


display(X.head())
display(y.head())
# display(sp500.dtypes)

Unnamed: 0_level_0,Industrials,Health Care,Information Technology,Communication Services,Consumer Staples,Consumer Discretionary,Utilities,Financials,Materials,Real Estate,Energy
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-04-13 04:00:00,-0.001641,0.00651,4.5e-05,-0.000767,0.000146,-2e-06,0.016096,-0.004232,-0.00321,0.007684,0.00114
2021-04-14 04:00:00,-0.000804,0.000509,-0.004617,-0.004491,-0.001597,-0.002539,0.007733,0.010694,0.009282,-0.004979,0.020342
2021-04-15 04:00:00,0.000453,0.011926,0.004855,0.003417,0.005316,-0.004535,0.011355,-0.001082,0.004324,0.011913,-0.006888
2021-04-16 04:00:00,-0.003498,0.002819,-0.002692,-0.003881,0.001754,0.002421,0.002735,-0.004246,-0.001499,-0.003487,-0.018957
2021-04-19 04:00:00,-0.004149,-0.000356,-0.009452,-0.002727,-0.002429,-0.007847,-0.005995,-0.003404,-0.005562,0.002253,-0.001944


Unnamed: 0_level_0,SPY
timestamp,Unnamed: 1_level_1
2021-04-13 04:00:00,0.0
2021-04-14 04:00:00,1.0
2021-04-15 04:00:00,0.0
2021-04-16 04:00:00,0.0
2021-04-19 04:00:00,0.0


In [16]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2021-04-13 04:00:00


In [17]:
# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=6)

# Display the training end date
print(training_end)

2021-10-13 04:00:00


In [18]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Display sample data
X_train.head()

Unnamed: 0_level_0,Industrials,Health Care,Information Technology,Communication Services,Consumer Staples,Consumer Discretionary,Utilities,Financials,Materials,Real Estate,Energy
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-04-13 04:00:00,-0.001641,0.00651,4.5e-05,-0.000767,0.000146,-2e-06,0.016096,-0.004232,-0.00321,0.007684,0.00114
2021-04-14 04:00:00,-0.000804,0.000509,-0.004617,-0.004491,-0.001597,-0.002539,0.007733,0.010694,0.009282,-0.004979,0.020342
2021-04-15 04:00:00,0.000453,0.011926,0.004855,0.003417,0.005316,-0.004535,0.011355,-0.001082,0.004324,0.011913,-0.006888
2021-04-16 04:00:00,-0.003498,0.002819,-0.002692,-0.003881,0.001754,0.002421,0.002735,-0.004246,-0.001499,-0.003487,-0.018957
2021-04-19 04:00:00,-0.004149,-0.000356,-0.009452,-0.002727,-0.002429,-0.007847,-0.005995,-0.003404,-0.005562,0.002253,-0.001944


In [19]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end:]
y_test = y.loc[training_end:]

# Display sample data
X_test.head()

Unnamed: 0_level_0,Industrials,Health Care,Information Technology,Communication Services,Consumer Staples,Consumer Discretionary,Utilities,Financials,Materials,Real Estate,Energy
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-10-13 04:00:00,-0.002116,-0.002039,0.003052,-0.001891,-0.001348,-0.001194,0.009721,-0.003668,-0.002619,0.008394,0.010501
2021-10-14 04:00:00,0.012505,0.004434,0.011716,0.005219,0.008504,0.007698,0.008099,0.007248,0.013811,0.006983,-0.001271
2021-10-15 04:00:00,-0.001331,-0.001917,0.00166,-0.005719,-0.008736,-0.006364,-0.004472,-0.000891,-0.007741,-0.00684,-0.009481
2021-10-18 04:00:00,0.00792,-0.001567,0.010961,0.005152,-0.00121,0.01065,-0.003401,0.002837,0.009422,0.007576,-0.00847
2021-10-19 04:00:00,-0.000654,0.005998,0.007174,0.004999,-0.00017,-0.007061,0.003243,0.004149,-0.001142,-0.007184,0.00317


In [20]:
# Create a StandardScaler instance
scaler = StandardScaler()
 
# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)
 
# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [21]:
# Scaled data
rdm_forest_model = RandomForestClassifier(max_depth=10, random_state=None)
rdm_forest_model.fit(X_train_scaled, np.ravel(y_train, order='c'), sample_weight=None)
# rdm_forest_model.fit(X_train_scaled, y_train.values.ravel(), sample_weight=None)

rdm_forest_pred = rdm_forest_model.predict(X_train_scaled)

# Review the model's predicted trained values
print('length rdm_forest_pred', len(rdm_forest_pred))
print('rdm_forest_pred', rdm_forest_pred)

length rdm_forest_pred 129
rdm_forest_pred [0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1.
 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.
 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 0. 0. 0. 1. 1.]


In [22]:
display(rdm_forest_model.feature_importances_)
display(sp500.columns)

array([0.07694613, 0.09485798, 0.09424785, 0.09658429, 0.09560509,
       0.09395988, 0.11842473, 0.08370175, 0.07675333, 0.07728007,
       0.0916389 ])

Index(['SPY Open', 'SPY Close', 'Industrials', 'Health Care',
       'Information Technology', 'Communication Services', 'Consumer Staples',
       'Consumer Discretionary', 'Utilities', 'Financials', 'Materials',
       'Real Estate', 'Energy', 'SPY'],
      dtype='object')

In [23]:
y_pred = rdm_forest_model.predict(X_test_scaled)

# Review the model's predicted test values
# print('length y_pred', len(y_pred))
# print('y_pred', y_pred)

In [24]:
#Import scikit-learn metrics module for accuracy calculation

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.48412698412698413


In [25]:
# Review the classification report associated with the model predictions.
# Use a classification report to evaluate the model using the predictions and training data
rfm_testing_pred = rdm_forest_model.predict(X_test_scaled)
rfm_testing_report = classification_report(y_test, rfm_testing_pred)

# Print the classification report
print(rfm_testing_report)

              precision    recall  f1-score   support

         0.0       0.45      0.33      0.38        61
         1.0       0.50      0.63      0.56        65

    accuracy                           0.48       126
   macro avg       0.48      0.48      0.47       126
weighted avg       0.48      0.48      0.47       126



In [26]:
# Create a predictions DataFrame that contains columns for “Predicted” values, “Actual Returns”, and “Strategy Returns”
# Create a predictions DataFrame
predictions_df = pd.DataFrame(index=X_test.index)

# Add the SVM model predictions to the DataFrame
predictions_df['Predicted'] = rfm_testing_pred

predictions_df

# Add the actual returns to the DataFrame
# predictions_df['Actual Returns'] = sp500['Actual Returns']

# # Add the strategy returns to the DataFrame
# predictions_df['Strategy Returns'] = predictions_df['Actual Returns'] * predictions_df['Predicted']

Unnamed: 0_level_0,Predicted
timestamp,Unnamed: 1_level_1
2021-10-13 04:00:00,1.0
2021-10-14 04:00:00,1.0
2021-10-15 04:00:00,1.0
2021-10-18 04:00:00,1.0
2021-10-19 04:00:00,1.0
...,...
2022-04-06 04:00:00,0.0
2022-04-07 04:00:00,1.0
2022-04-08 04:00:00,0.0
2022-04-11 04:00:00,1.0
