In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from datetime import date
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset from Resources/sp500_adj_close_raw.csv and parse the Date column
file_path = Path("Resources/sp500_adj_close_raw.csv")
df = pd.read_csv(file_path)
df["Date"] = pd.to_datetime(df["Date"])

# A bare `df.shape` in the middle of a cell is evaluated and silently discarded
# (only the last expression of a cell is displayed), so print it explicitly.
print("Shape:", df.shape)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979679 entries, 0 to 1979678
Data columns (total 14 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Date            datetime64[ns]
 1   Ticker          object        
 2   Adjusted Close  float64       
 3   Return          float64       
 4   Volatility      float64       
 5   RSI             float64       
 6   SMA_50          float64       
 7   SMA_100         float64       
 8   SMA_200         float64       
 9   Upper Band      float64       
 10  Lower Band      float64       
 11  Support         float64       
 12  Resistance      float64       
 13  Action          object        
dtypes: datetime64[ns](1), float64(11), object(2)
memory usage: 211.5+ MB


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.72725,22.540232,21.392035,24.351929,short
1,2008-01-02,AAPL,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.19763,6.135834,5.403559,4.637376,6.026839,buy
2,2008-01-02,ABT,18.130205,-0.006092,0.010484,34.677586,18.138458,17.62825,17.709028,19.233109,18.221804,16.775562,19.13401,short
3,2008-01-02,ACGL,7.608889,0.020444,0.016022,45.15419,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy
4,2008-01-02,ACN,26.437078,-0.017194,0.024039,54.812183,26.577982,27.78442,28.471031,28.227205,24.273773,24.765505,29.215664,sell


In [3]:
# Flag rows dated today so they can be separated from the historical data
# (removes potential infill bias from rows carrying today's date).
today = date.today()

# Compare at day resolution. normalize() keeps the comparison vectorized on the
# datetime64 column instead of materializing one Python date object per row,
# which is what .dt.date does; the resulting boolean mask is the same.
filter_data_by_date = df["Date"].dt.normalize() == pd.Timestamp(today)

# Create a new DataFrame holding only today's rows
todays_data = df[filter_data_by_date].reset_index(drop=True)
print("Shape:", todays_data.shape)

# Display the DataFrame
display(todays_data.tail())

Shape: (0, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action


In [4]:
# Keep every row that is NOT flagged as today's data and continue with that frame
is_historical = ~filter_data_by_date
historical_data = df.loc[is_historical]
df = historical_data

print("Shape:", historical_data.shape)
display(historical_data.tail())

Shape: (1979679, 14)


Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
1979674,2024-10-23,XYL,131.74,0.002817,0.009287,45.930984,133.56926,134.55832,130.6566,137.97014,131.13087,126.71,137.53,buy
1979675,2024-10-23,YUM,134.01,0.004874,0.010931,33.05073,134.76096,133.57689,133.94536,140.49457,130.85544,129.71,139.92,buy
1979676,2024-10-23,ZBH,104.68,0.004028,0.010726,51.86028,107.59999,108.161766,115.56048,108.536896,101.40522,101.77,115.91237,buy
1979677,2024-10-23,ZBRA,368.08,-0.010538,0.00987,55.442924,354.9792,335.7742,309.98975,378.89667,362.71933,320.77,377.68,short
1979678,2024-10-23,ZTS,188.99,-0.002744,0.010509,45.437954,189.215,183.13512,179.39548,196.47697,186.50803,180.9,196.48,sell


In [5]:
# Features: every column except Return, Date and the target (so 'Ticker' stays in X)
X = df.drop(["Return", "Date", "Action"], axis=1)
# Target: the 'Action' column
y = df.loc[:, "Action"]


In [6]:
# Build the classification target used by the train/test split below.
#
# pd.cut needs numeric input and exactly len(bins) - 1 labels. The three
# intervals (-inf, -0.01], (-0.01, 0.01], (0.01, inf] therefore get three labels;
# the previous four-label list made pd.cut raise.
bins = [-np.inf, -0.01, 0.01, np.inf]
labels = ['Sell', 'Hold', 'Buy']

if pd.api.types.is_numeric_dtype(y):
    # Numeric target: discretize it into the three classes above
    y_binned = pd.cut(y, bins=bins, labels=labels)
else:
    # Non-numeric target (the 'Action' column has dtype object and already holds
    # class labels such as buy/sell/short): it cannot be binned, so use it as-is.
    y_binned = y

In [7]:
# Split the data: 80% training / 20% testing.
# test_size is the fraction held out for testing; the previous value of 0.8 trained
# on only 20% of the rows, unlike the 80/20 split used later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y_binned, test_size=0.2, random_state=42)

In [8]:
# Preprocess the numerical and categorical features.
# NOTE(review): 'Adjusted Close' is still a column of X but is not listed below, so
# ColumnTransformer's default remainder='drop' discards it — confirm this is intended.
numeric_features = ['Volatility', 'RSI', 'SMA_50', 'SMA_100', 'SMA_200', 'Upper Band', 'Lower Band', 'Support', 'Resistance']
categorical_features = ['Ticker']

preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric indicators
        ('num', StandardScaler(), numeric_features),
        # One-hot encode the ticker. handle_unknown='ignore' encodes a ticker that was
        # not seen during fit as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor on the training split only, then apply it to both splits
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [9]:
# Chain the preprocessing step and a 5-nearest-neighbours classifier into one estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
]
model = Pipeline(steps=pipeline_steps)

In [10]:
# Fit the whole pipeline (preprocessing + classifier) on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Sweep over candidate k values and record train/test accuracy for each one.
# Only odd values are tried, to avoid ties.
k_values = range(1, 20, 2)
train_scores, test_scores = [], []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot accuracy against k for both splits
plt.plot(k_values, train_scores, marker='o', label="Training Scores")
plt.plot(k_values, test_scores, marker="x", label="Testing Scores")
plt.title("KNN Classifier Accuracy for Different k Values")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# # Evaluate the model
# train_score = model.score(X_train, y_train)
# test_score = model.score(X_test, y_test)
# print(f"Train Score: {train_score}")
# print(f"Test Score: {test_score}")

In [None]:
# columns_to_encode = ["Ticker"]

# # Create an instance of OneHotEncoder()
# enc = OneHotEncoder(handle_unknown='ignore')

# # Fit the encoder to the data
# enc.fit(df[columns_to_encode])

# # Transform the data
# df_ohe = enc.transform(df[columns_to_encode])

# # Default output is sparse matrix
# df_ohe

In [None]:
# # Get new feature names
# enc.get_feature_names_out()

In [None]:
# # Set up the OneHotEncoder so it will transform to Pandas
# ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# ohe.set_output(transform="pandas")

# # Fit and transform the OneHotEncoder to the columns to encode
# df_ohe = ohe.fit_transform(df[columns_to_encode])
# df_ohe.head()

In [None]:
# Get the target variable (the "Action" column).
# It is read from df: df_ohe is only created in commented-out cells, so referencing
# it raised NameError, and a one-hot encoding of "Ticker" has no "Action" column.
y = df["Action"]
print("Shape:", y.shape)

In [None]:
# Get the features: everything except
#   - "Action": the target; leaving it in X leaks the label into the features
#   - "Date" and "Ticker": non-numeric columns the StandardScaler fitted below cannot handle
#   - "Return": excluded to match the feature set built earlier in this notebook
# DataFrame.drop returns a new frame, so no explicit copy of df is needed.
X = df.drop(columns=["Date", "Action", "Ticker", "Return"])
print("Shape:", X.shape)

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of every split
for caption, split in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, split.shape)

In [None]:
# Inspect the dtype of each training feature column
X_train.dtypes

In [None]:
# Create a StandardScaler and fit it to the training data only
X_scaler = StandardScaler()
X_scaler.fit(X=X_train)

In [None]:
# Apply the fitted X_scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Try each odd neighbour count from 1 to 19 (odd only, to avoid ties) and
# collect the accuracy on the training and testing splits.
neighbor_counts = list(range(1, 20, 2))
train_scores = []
test_scores = []
for k in neighbor_counts:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score, test_score = (
        knn.score(features, target)
        for features, target in ((X_train_scaled, y_train), (X_test_scaled, y_test))
    )
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot both accuracy curves against k
plt.plot(neighbor_counts, train_scores, marker='o', label="training scores")
plt.plot(neighbor_counts, test_scores, marker="x", label="testing scores")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()