In [35]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, r2_score
from pathlib import Path
from datetime import date
import pandas as pd
import numpy as np

In [13]:
# Import data
file_path = Path("Resources/sp500_adj_close_raw.csv")
df = pd.read_csv(file_path)
df["Date"] = pd.to_datetime(df["Date"])
df.shape
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979679 entries, 0 to 1979678
Data columns (total 14 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Date            datetime64[ns]
 1   Ticker          object        
 2   Adjusted Close  float64       
 3   Return          float64       
 4   Volatility      float64       
 5   RSI             float64       
 6   SMA_50          float64       
 7   SMA_100         float64       
 8   SMA_200         float64       
 9   Upper Band      float64       
 10  Lower Band      float64       
 11  Support         float64       
 12  Resistance      float64       
 13  Action          object        
dtypes: datetime64[ns](1), float64(11), object(2)
memory usage: 211.5+ MB


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.72725,22.540232,21.392035,24.351929,short
1,2008-01-02,AAPL,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.19763,6.135834,5.403559,4.637376,6.026839,buy
2,2008-01-02,ABT,18.130205,-0.006092,0.010484,34.677586,18.138458,17.62825,17.709028,19.233109,18.221804,16.775562,19.13401,short
3,2008-01-02,ACGL,7.608889,0.020444,0.016022,45.15419,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy
4,2008-01-02,ACN,26.437078,-0.017194,0.024039,54.812183,26.577982,27.78442,28.471031,28.227205,24.273773,24.765505,29.215664,sell


In [16]:
# Drop rows with todays date to remove potentail infill bias
today = date.today()
filter_data_by_date = df["Date"].dt.date == today  # Use .dt.date to compare only the date part

#Create a new DF with todays data
todays_data = df[filter_data_by_date].reset_index(drop=True)
print("Shape:", todays_data.shape)

# Display dataframe
display(todays_data.tail())

Shape: (0, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action


In [19]:
# Create a new DF with historical data (excluding today's data)
historical_data = df[~filter_data_by_date]

# Set the 'Date' column as the index
historical_data.set_index('Date', inplace=True)

# Assign the filtered DataFrame back to df
df = historical_data

print("Shape:", df.shape)

# Display the last few rows of the historical data
display(df.tail())

Shape: (1979679, 13)


Unnamed: 0_level_0,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-10-23,XYL,131.74,0.002817,0.009287,45.930984,133.56926,134.55832,130.6566,137.97014,131.13087,126.71,137.53,buy
2024-10-23,YUM,134.01,0.004874,0.010931,33.05073,134.76096,133.57689,133.94536,140.49457,130.85544,129.71,139.92,buy
2024-10-23,ZBH,104.68,0.004028,0.010726,51.86028,107.59999,108.161766,115.56048,108.536896,101.40522,101.77,115.91237,buy
2024-10-23,ZBRA,368.08,-0.010538,0.00987,55.442924,354.9792,335.7742,309.98975,378.89667,362.71933,320.77,377.68,short
2024-10-23,ZTS,188.99,-0.002744,0.010509,45.437954,189.215,183.13512,179.39548,196.47697,186.50803,180.9,196.48,sell


In [22]:
# Initialize the LabelEncoder and fit it to the Ticker column:

le_ticker = LabelEncoder()
le_action = LabelEncoder()
df['Ticker'] = le_ticker.fit_transform(df['Ticker'])
df['Action'] = le_action.fit_transform(df['Action'])

print(df)

            Ticker  Adjusted Close    Return  Volatility        RSI  \
Date                                                                  
2008-01-02       0       23.256384 -0.009918    0.015705  48.827618   
2008-01-02       1        5.876342  0.000462    0.018937  59.067432   
2008-01-02       4       18.130205 -0.006092    0.010484  34.677586   
2008-01-02       5        7.608889  0.020444    0.016022  45.154190   
2008-01-02       6       26.437078 -0.017194    0.024039  54.812183   
...            ...             ...       ...         ...        ...   
2024-10-23     496      131.740000  0.002817    0.009287  45.930984   
2024-10-23     497      134.010000  0.004874    0.010931  33.050730   
2024-10-23     498      104.680000  0.004028    0.010726  51.860280   
2024-10-23     499      368.080000 -0.010538    0.009870  55.442924   
2024-10-23     500      188.990000 -0.002744    0.010509  45.437954   

                SMA_50     SMA_100     SMA_200  Upper Band  Lower Band  \
Da

In [24]:
# Split the data into training and testing sets
# Get the target variable (the "Occupancy" column)
y = df["Return"]#.values.reshape(-1,1)
print("Shape:", y.shape)

Shape: (1979679,)


In [26]:
# Get the features (everything except the "Occupancy" column)
X = df.copy()
# X = X.drop(columns=["Return", "Date", "Ticker", "Action"])
X = X.drop(columns=["Return"])
print("Shape:", X.shape)

Shape: (1979679, 12)


In [28]:
# Split the data into training and testing sets 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1583743, 12)
Shape of X_test: (395936, 12)
Shape of y_train: (1583743,)
Shape of y_test: (395936,)


In [30]:
# Preprocess the categorical and numerical features
preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [37]:
# Prepare the models to evaluate
models = {
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42),
    'SVR': SVR()
}

In [None]:
# Perform Grid Search for each model
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    if model_name == 'KNN':
        param_grid = {'regressor__n_neighbors': range(1, 20, 2)}
    elif model_name == 'RandomForest':
        param_grid = {'regressor__n_estimators': [50, 100, 200],
                      'regressor__max_depth': [None, 10, 20]}
    elif model_name == 'GradientBoosting':
        param_grid = {'regressor__n_estimators': [50, 100, 200],
                      'regressor__learning_rate': [0.01, 0.1, 0.2]}
    elif model_name == 'XGBoost':
        param_grid = {'regressor__n_estimators': [50, 100, 200],
                      'regressor__learning_rate': [0.01, 0.1, 0.2],
                      'regressor__max_depth': [3, 5, 7]}
    elif model_name == 'SVR':
        param_grid = {'regressor__C': [0.1, 1, 10],
                      'regressor__kernel': ['linear', 'poly', 'rbf']}
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)
    
    # Best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f"{model_name}: Best Params: {best_params}, Best Cross-Validation Score: {best_score:.3f}")
    
    # Evaluate the model with the best parameters
    best_model = grid_search.best_estimator_
    train_score = best_model.score(X_train, y_train)
    test_score = best_model.score(X_test, y_test)
    print(f"{model_name}: Train Score: {train_score:.3f}, Test Score: {test_score:.3f}")


In [None]:
# Plot the results for the best model (based on Cross-Validation Score)
best_model_name = max(models, key=lambda name: models[name].score(X_test_scaled, y_test))
best_model = models[best_model_name]
plt.plot(range(1, 20, 2), train_scores, marker='o', label="Training Scores")
plt.plot(range(1, 20, 2), test_scores, marker="x", label="Testing Scores")
plt.xlabel("k neighbors")
plt.ylabel("r^2")
plt.legend()
plt.title(f"{best_model_name} Accuracy for Different k Values")
plt.show()