In [7]:
from datetime import date
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [5]:
# Import data
file_path = Path("Resources/sp500_adj_close_raw.csv")
df = pd.read_csv(file_path)
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows with today's date to remove potential infill bias.
# NOTE: the cut-off is the day the notebook is executed, so the set of rows
# kept can differ between runs.
today = date.today()
filter_data_by_date = df["Date"].dt.date == today
todays_data = df[filter_data_by_date].reset_index(drop=True)

# Take an explicit copy of the filtered rows: later cells add columns to `df`,
# and doing that on a boolean-indexed slice makes pandas emit
# SettingWithCopyWarning. `df` and `historical_data` intentionally stay bound
# to the same object, as before.
historical_data = df[~filter_data_by_date].copy()
df = historical_data
df.head()

Unnamed: 0,Date,Ticker,Adjusted Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
0,2008-01-02,A,23.256384,-0.009918,0.015705,48.827618,23.314175,23.299887,23.564934,24.72725,22.540232,21.392035,24.351929,short
1,2008-01-02,AAPL,5.876342,0.000462,0.018937,59.067432,5.518483,4.939064,4.19763,6.135834,5.403559,4.637376,6.026839,buy
2,2008-01-02,ABT,18.130205,-0.006092,0.010484,34.677586,18.138458,17.62825,17.709028,19.233109,18.221804,16.775562,19.13401,short
3,2008-01-02,ACGL,7.608889,0.020444,0.016022,45.15419,7.785511,7.878933,7.874161,8.114465,7.378535,7.463333,8.307778,buy
4,2008-01-02,ACN,26.437078,-0.017194,0.024039,54.812183,26.577982,27.78442,28.471031,28.227205,24.273773,24.765505,29.215664,sell


In [None]:
# Feature Engineering
def extract_date_features(df):
    """Return a copy of ``df`` with calendar features derived from ``Date``.

    Four columns are appended (or overwritten if they already exist), in this
    order: ``Year``, ``Month``, ``DayOfWeek`` and ``Day``. ``DayOfWeek``
    follows the pandas convention (Monday=0 ... Sunday=6).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime-typed ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified, so calling this
        on a filtered slice does not trigger SettingWithCopyWarning and
        re-running the cell is side-effect free.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    AttributeError
        If ``Date`` is not datetime-like (the ``.dt`` accessor is unavailable).
    """
    dates = df['Date'].dt
    # assign() builds a new DataFrame instead of writing into the caller's.
    return df.assign(
        Year=dates.year,
        Month=dates.month,
        DayOfWeek=dates.dayofweek,
        Day=dates.day,
    )

df = extract_date_features(df)

# Build the feature matrix: drop the target, the identifier/label columns and
# the raw date (its information is carried by the calendar features above).
# "Unnamed: 0" looks like a row index saved into the CSV (see the df.head()
# output) rather than a real predictor, so it is excluded as well;
# errors="ignore" keeps this working if the CSV no longer has that column.
X = df.drop(columns=["Return", "Date", "Ticker", "Action"]).drop(
    columns=["Unnamed: 0"], errors="ignore"
)
y = df["Return"]

# Split the data into training and testing sets
# NOTE(review): train_test_split shuffles rows, so test rows can pre-date
# training rows. For time-ordered price data a chronological split may give a
# more honest estimate -- confirm the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# KNN with Cross-Validation
# The scaler is part of the pipeline so that, inside every CV fold, it is
# fitted on that fold's training portion only. Fitting it once on all of
# X_train beforehand would let each validation fold influence its own scaling.
param_grid = {'knn__n_neighbors': range(1, 20, 2)}
knn = KNeighborsRegressor()
knn_pipeline = Pipeline([("scaler", StandardScaler()), ("knn", knn)])
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best parameters and score
best_k = grid_search.best_params_['knn__n_neighbors']
best_score = grid_search.best_score_
print(f"Best k: {best_k}, Best cross-validation score: {best_score:.3f}")

# Train and evaluate the model with the best k
# GridSearchCV refits the best pipeline on the whole of X_train by default
# (refit=True), so the fitted scaler and regressor are reused instead of
# training an identical model a second time.
best_pipeline = grid_search.best_estimator_
X_scaler = best_pipeline.named_steps["scaler"]
knn_best = best_pipeline.named_steps["knn"]

# Scaled matrices are kept under their original names for any later cells.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

train_score = knn_best.score(X_train_scaled, y_train)
test_score = knn_best.score(X_test_scaled, y_test)
print(f"Train Score: {train_score:.3f}, Test Score: {test_score:.3f}")

# Plot the results
# Reuse the scores computed above: each KNN .score() call runs a full
# neighbour search, so recomputing them just for the plot doubles the cost.
train_scores = [train_score]
test_scores = [test_score]
fig, ax = plt.subplots()
ax.plot([best_k], train_scores, marker='o', label="Training Score")
ax.plot([best_k], test_scores, marker='x', label="Testing Score")
ax.set_xlabel("k neighbors")
ax.set_ylabel("r^2")
ax.legend()
ax.set_title("KNN Regressor Accuracy for Best k Value")
plt.show()