In [19]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from sklearn.impute import SimpleImputer

In [3]:
# Import data
file_path = Path("Resources/test_w_na.csv")
df = pd.read_csv(file_path)
# Convert date to datetime data type
df["Date"] = pd.to_datetime(df["Date"])
df.set_index('Date', inplace=True)
df.shape
df.tail()

Unnamed: 0_level_0,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-10-25,XYL,130.41,,130.65,-0.001837,0.009297,39.056873,133.51048,134.41891,130.8492,138.39948,129.94153,126.71,137.53,
2024-10-25,YUM,133.04,,133.16,-0.000901,0.006858,34.395523,134.61107,133.44423,134.0083,139.50783,130.66417,129.71,139.92,
2024-10-25,ZBH,102.33,,104.0,-0.016058,0.01061,50.197327,107.36608,107.9697,115.37445,107.723175,101.30882,101.77,115.91237,
2024-10-25,ZBRA,359.97,,362.05,-0.005745,0.009788,43.769238,355.8908,336.8905,311.04135,380.01144,359.82355,320.77,377.68,
2024-10-25,ZTS,179.91,,181.5,-0.00876,0.012576,36.496883,189.094,183.3149,179.2437,197.88783,182.27017,179.91,196.48,


In [4]:
# Drop rows with NaN in 'Next Day Close'
df = df.dropna(subset=['Next Day Close'])
df.tail()

Unnamed: 0_level_0,Ticker,Adjusted Close,Next Day Close,Previous Day Close,Return,Volatility,RSI,SMA_50,SMA_100,SMA_200,Upper Band,Lower Band,Support,Resistance,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-10-24,XYL,130.65,130.41,131.72,-0.008123,0.009364,41.637012,133.55902,134.4822,130.75377,138.21664,130.53435,126.71,137.53,short
2024-10-24,YUM,133.16,133.04,134.02,-0.006417,0.010962,32.3681,134.69278,133.5222,133.97957,140.23546,130.62454,129.71,139.92,short
2024-10-24,ZBH,104.0,102.33,104.7,-0.006686,0.010517,47.462685,107.51261,108.07578,115.47507,108.18526,101.41174,101.77,115.91237,short
2024-10-24,ZBRA,362.05,359.97,368.09,-0.016409,0.010532,44.39913,355.5556,336.3185,310.51785,379.3288,361.3692,320.77,377.68,short
2024-10-24,ZTS,181.5,179.91,188.99,-0.039632,0.013328,35.00878,189.1774,183.23051,179.33058,197.27248,184.37552,180.9,196.48,short


In [21]:
df_clean = df.dropna()

In [23]:
# Define features (X) and target (y)
X = df_clean.drop(columns=['Next Day Close', 'Ticker', 'Action'])
y = df_clean['Next Day Close']

In [25]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Scale the data using standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Create KNN regression model
knn = KNeighborsRegressor()


In [31]:
# Fit the model
knn.fit(X_train_scaled, y_train)

In [33]:
# Make predictions
y_pred = knn.predict(X_test_scaled)


In [35]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [37]:
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared:', r2)

Mean Absolute Error: 2.3314645132348946
Mean Squared Error: 30.799548032384507
R-squared: 0.9992100761596074


In [41]:
# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Create GridSearchCV object
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}


In [45]:
# Get the best model
best_knn = grid_search.best_estimator_
# Make final predictions on the test set
final_predictions = best_knn.predict(X_test_scaled)

In [47]:
# Display final predictions
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': final_predictions})
print(predictions_df)

               Actual  Predicted
Date                            
2023-04-03  81.373910  80.421031
2010-08-24  17.004652  17.285588
2022-08-10  96.575066  93.743052
2017-10-10  54.291428  54.233566
2009-11-06  78.234310  76.039661
...               ...        ...
2010-07-12  31.008406  30.068058
2010-04-26  31.837200  32.178479
2011-01-19  20.599249  19.054517
2013-09-12  25.092500  24.186095
2020-12-16  52.828434  51.677488

[395480 rows x 2 columns]


In [49]:
# Create predictions DataFrame
predictions_df = pd.DataFrame({'Ticker': tickers_test, 'Date': dates_test, 'Actual': y_test, 'Predicted': final_predictions})

NameError: name 'tickers_test' is not defined