In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from pathlib import Path

# Read the source CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
file_path = Path("Resources/sp500_adj_close_raw.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Extract date features
def extract_date_features(df):
    """Return a copy of *df* with calendar features derived from its 'Date' column.

    Four columns are added (or overwritten if they already exist):
    'Year', 'Month', 'DayOfWeek' (Monday=0 ... Sunday=6) and 'Day'
    (day of the month). The 'Date' column itself is left as it was passed in.

    Args:
        df: DataFrame with a 'Date' column holding datetimes, or values that
            ``pd.to_datetime`` can parse (e.g. ISO date strings).

    Returns:
        A new DataFrame with the extra columns; the input frame is not modified.

    Raises:
        KeyError: If *df* has no 'Date' column.
    """
    # Already-datetime columns pass through to_datetime unchanged; string
    # columns are parsed here instead of failing on the .dt accessor.
    dates = pd.to_datetime(df['Date'])

    # assign() builds a new frame, so the caller's DataFrame is never mutated
    # (and no SettingWithCopyWarning is triggered when df is a slice).
    return df.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        DayOfWeek=dates.dt.dayofweek,
        Day=dates.dt.day,
    )

# Apply date feature extraction.
# 'Date' is converted to datetime first so the .dt accessor is available.
df['Date'] = pd.to_datetime(df['Date'])
df = extract_date_features(df)

# Build the feature matrix: drop the target ('Action'), the raw 'Date'
# (replaced by the derived calendar columns) and 'Return'.
# NOTE(review): 'Return' is presumably excluded to avoid leaking the outcome
# into the features -- confirm.
X = df.drop(columns=["Return", "Date", "Action"])
y = df["Action"]

# Encode 'Action' into discrete classes
action_mapping = {'buy': 0, 'sell': 1, 'hold': 2, 'short': 3}
y_encoded = y.map(action_mapping)

# Series.map yields NaN for every label that is not a key of action_mapping
# (including missing values). Fail here with the offending labels instead of
# letting NaN targets reach the classifier.
unknown_actions = y[y_encoded.isna()]
if not unknown_actions.empty:
    raise ValueError(
        "Unrecognized 'Action' labels: "
        f"{sorted(unknown_actions.astype(str).unique())}; "
        f"expected one of {sorted(action_mapping)}"
    )

# Split the data: 70% train / 30% test, reproducible via the fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so dated rows from
# the same period end up on both sides of the split -- confirm that a random
# (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Columns handled by each preprocessing branch. Columns of X that appear in
# neither list are dropped by ColumnTransformer (its default remainder='drop').
numeric_features = [
    'Volatility', 'RSI', 'SMA_50', 'SMA_100', 'SMA_200',
    'Upper Band', 'Lower Band', 'Support', 'Resistance',
    'Year', 'Month', 'DayOfWeek', 'Day',
]
categorical_features = ['Ticker']

# Preprocess the categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a ticker that only occurs in the test split
        # is encoded as all zeros instead of raising during transform().
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split so no test statistics leak into scaling.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Neighbor counts to evaluate: the odd values 1, 3, ..., 19.
# Defined once so the training loop and the plot x-axis cannot drift apart.
k_values = range(1, 20, 2)

# Accuracy per k, in the same order as k_values
train_scores = []
test_scores = []

# Fit one KNN classifier per k and record accuracy on both splits
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Plot train vs. test accuracy against k to compare the two curves
plt.plot(k_values, train_scores, marker='o', label="Training Scores")
plt.plot(k_values, test_scores, marker="x", label="Testing Scores")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy Score")
plt.title("KNN Classifier Accuracy for Different k Values")
plt.legend()
plt.show()
