In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_ta as ta
import yfinance as yf

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [3]:
# Read the ticker symbols, one per line, stripping surrounding whitespace
# (including the trailing newline) from each entry.
with open("sp500_list.txt", "r") as ticker_file:
    sp500_list = list(map(str.strip, ticker_file))

In [4]:
# Load the cached market data and rebuild a two-level column index of
# (field, ticker), with every "Adj Close" column first and every "Volume"
# column second, each in the order given by sp500_list.
rfdata = pd.read_csv("market_data.csv", index_col=0, low_memory=False)

# The first two rows are dropped before the float conversion below.
# NOTE(review): presumably these are leftover header rows from the CSV
# export -- confirm against market_data.csv.
rfdata = rfdata.drop(rfdata.index[[0, 1]])

# Build the column labels from the ticker list itself instead of a hard-coded
# count, so the cell keeps working when the constituent list changes size.
# pandas raises a length-mismatch ValueError if the CSV does not contain
# exactly 2 * len(sp500_list) data columns.
rfdata.columns = pd.MultiIndex.from_product([["Adj Close", "Volume"], sp500_list])

# Use the public DatetimeIndex constructor rather than the private
# pd.core.indexes.datetimes path, which is not part of the stable API.
rfdata.index = pd.DatetimeIndex(rfdata.index)
rfdata = rfdata.astype(float)

In [7]:
# Keep only the columns without any missing value, then list the tickers
# whose "Adj Close" column survived that filter.
complete_columns = rfdata.dropna(axis=1)
rel_list = list(complete_columns["Adj Close"].columns)

# Model inputs: the five lagged-return columns followed by the five
# lagged-volume columns ("Ret-1" ... "Ret-5", "Vol-1" ... "Vol-5").
feature_names = [
    f"{prefix}-{lag}"
    for prefix in ("Ret", "Vol")
    for lag in range(1, 6)
]

In [8]:
import warnings

# Suppress only warnings whose message matches the "centering the data"
# numerical-issues text; every other warning is still shown.
# NOTE(review): presumably emitted when standardising the large volume
# columns -- confirm the source of the warning.
warnings.filterwarnings(
    action="ignore",
    message=".*Numerical issues were encountered when centering the data.*",
)

In [9]:
%%time

score_train = []
score_test = []

df_pred = pd.DataFrame(columns=rel_list)

for i in rel_list:
    frame = pd.DataFrame({'Adj Close': rfdata["Adj Close"][i], 'Volume': rfdata["Volume"][i]})
    
    frame["Return"] = frame["Adj Close"].pct_change()
    
    for n in range(1,6):

        frame['Ret-' + str(n)] = frame["Return"].shift(n)
        frame["Vol-" + str(n)] = frame["Volume"].shift(n)
    
    frame = frame.dropna()
   

    features = frame[feature_names]
    targets = frame['Return']
    train_features =features["2013-01-01":"2022-12-31"]
    train_targets = targets["2013-01-01":"2022-12-31"]
    test_features = features["2022-12-31":]
    test_targets = targets["2022-12-31":]
    
    rfr = RandomForestRegressor(n_estimators=20, max_depth=4, min_samples_split=20, min_samples_leaf=4)
    fit = rfr.fit(scale(train_features), train_targets)
    
    score_train.append(fit.score(scale(train_features), train_targets))
    score_test.append(fit.score(scale(test_features), test_targets))
    
    df_pred[i] = list(fit.predict(scale(test_features)))
    
    

    
df_score = pd.DataFrame({"score_train" : score_train, "score_test": score_test}, index=rel_list)
    

CPU times: total: 4min 31s
Wall time: 4min 34s


In [9]:
#df_score.to_csv("RF2score.csv")
#df_pred.to_csv("RF2pred.csv")

In [11]:
# Column-wise average of the per-ticker random-forest scores.
df_score.mean(axis=0)

score_train    0.079621
score_test    -0.028297
dtype: float64

In [12]:
from sklearn.neighbors import KNeighborsRegressor

In [13]:
%%time

score_train_knn = []
score_test_knn = []

knn_pred = pd.DataFrame(columns=rel_list)

for i in rel_list:
    frame = pd.DataFrame({'Adj Close': rfdata["Adj Close"][i], 'Volume': rfdata["Volume"][i]})
    
    frame["Return"] = frame["Adj Close"].pct_change()
    
    for n in range(1,6):

        frame['Ret-' + str(n)] = frame["Return"].shift(n)
        frame["Vol-" + str(n)] = frame["Volume"].shift(n)
    
    frame = frame.dropna()
   

    features = frame[feature_names]
    targets = frame['Return']
    train_features =features["2013-01-01":"2022-12-30"]
    train_targets = targets["2013-01-01":"2022-12-30"]
    test_features = features["2022-12-30":]
    test_targets = targets["2022-12-30":]
    
    knn = KNeighborsRegressor(n_neighbors=20)
    fit_knn = knn.fit(scale(train_features), train_targets)
    
    score_train_knn.append(fit_knn.score(scale(train_features), train_targets))
    score_test_knn.append(fit_knn.score(scale(test_features), test_targets))
    knn_pred[i] = list(fit.predict(scale(test_features)))
    
    

knn_score = pd.DataFrame({"score_train" : score_train_knn, "score_test": score_test_knn}, index=rel_list)

CPU times: total: 1min 59s
Wall time: 2min 3s


In [10]:
#knn_score.to_csv("KNN2score.csv")
#knn_pred.to_csv("KNN2pred.csv")

In [15]:
# Column-wise average of the per-ticker KNN scores.
knn_score.mean(axis=0)

score_train    0.061398
score_test    -0.052859
dtype: float64