In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_ta as ta
import yfinance as yf

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

In [4]:
with open("sp500_list.txt", "r") as f:
    sp500_list = [line.strip() for line in f]

In [5]:
rfdata = pd.read_csv("market_data.csv", index_col=0, low_memory=False)
rfdata = rfdata.drop(rfdata.index[[0,1]])
rfdata.columns = pd.MultiIndex.from_arrays([["Adj Close"]*503 + ["Volume"]*503, sp500_list*2])
rfdata.index = pd.core.indexes.datetimes.DatetimeIndex(rfdata.index)
rfdata = rfdata.astype(float)

Create rel_list for the iterations and feature names for (X,y) values of the model

In [6]:
rel_list = list(rfdata.dropna(axis=1)["Adj Close"].columns)
feature_names = ['Ret-1', 'ema14', 'rsi14', 'ema30', 'rsi30', 'ema50', 'rsi50', 'ema200', 'rsi200', "pvt", "vol"]

In [7]:
import warnings
warnings.filterwarnings("ignore", message=".*Numerical issues were encountered.*")

In [8]:
%%time

score_train = []
score_test = []
df_pred = pd.DataFrame(columns=rel_list)

for i in rel_list:
    frame = pd.DataFrame({'Adj Close': rfdata["Adj Close"][i], 'Volume': rfdata["Volume"][i]})
    frame["Return"] = frame["Adj Close"].pct_change()
    frame['Ret-1'] = frame['Return'].shift(1)
    for n in [14, 30, 50, 200]:
        frame['ema' + str(n)] = ta.sma(close=frame['Adj Close'].shift(1), length=n) / frame['Adj Close'].shift(1)
        frame['rsi' + str(n)] = ta.rsi(close=frame['Adj Close'].shift(1), length=n) / frame['Adj Close'].shift(1)
    frame["pvt"] = ta.pvt(close=frame['Adj Close'].shift(1), volume=frame['Volume'].shift(1))
    frame['Vol-1'] = frame['Volume'].shift(1)
    frame = frame.dropna()
    frame['Vol-1_SMA5'] = ta.sma(close=frame['Vol-1'], length=5)
    frame = frame.dropna()
    frame["vol"] = frame["Vol-1"] / frame["Vol-1_SMA5"]

    features = frame[feature_names]
    targets = frame['Return']
    train_features = features["2013-01-01":"2022-12-30"]
    train_targets = targets["2013-01-01":"2022-12-30"]
    test_features = features["2022-12-30":]
    test_targets = targets["2022-12-30":]
    

    features = frame[feature_names]
    targets = frame['Return']
    train_features =features["2013-01-01":"2022-12-30"]
    train_targets = targets["2013-01-01":"2022-12-30"]
    test_features = features["2022-12-30":]
    test_targets = targets["2022-12-30":]
    
    rfr = RandomForestRegressor(n_estimators=20, max_depth=4, min_samples_split=20, min_samples_leaf=4)
    fit = rfr.fit(scale(train_features), train_targets)
    
    score_train.append(fit.score(scale(train_features), train_targets))
    score_test.append(fit.score(scale(test_features), test_targets))
    df_pred[i] = list(fit.predict(scale(test_features)))
    
    

    
df_score = pd.DataFrame({"score_train" : score_train, "score_test" : score_test}, index=rel_list)
    

CPU times: total: 5min 19s
Wall time: 5min 36s


In [9]:
df_score.mean()

score_train    0.085636
score_test    -0.059111
dtype: float64

In [9]:
#df_score.to_csv("RF1score.csv")
#df_pred.to_csv("RF1pred.csv")

Loading the predictions

# KNN

In [12]:
from sklearn.neighbors import KNeighborsRegressor

In [13]:
feature_names

['Ret-1',
 'ema14',
 'rsi14',
 'ema30',
 'rsi30',
 'ema50',
 'rsi50',
 'ema200',
 'rsi200',
 'pvt',
 'vol']

In [14]:
%%time

score_train_knn = []
score_test_knn = []
knn_pred = pd.DataFrame(columns=rel_list)

for i in rel_list:
    frame = pd.DataFrame({'Adj Close': rfdata["Adj Close"][i], 'Volume': rfdata["Volume"][i]})
    frame["Return"] = frame["Adj Close"].pct_change()
    frame['Ret-1'] = frame['Return'].shift(1)
    for n in [14, 30, 50, 200]:
        frame['ema' + str(n)] = ta.sma(close=frame['Adj Close'].shift(1), length=n) / frame['Adj Close'].shift(1)
        frame['rsi' + str(n)] = ta.rsi(close=frame['Adj Close'].shift(1), length=n) / frame['Adj Close'].shift(1)
    frame["pvt"] = ta.pvt(close=frame['Adj Close'].shift(1), volume=frame['Volume'].shift(1))
    frame['Vol-1'] = frame['Volume'].shift(1)
    frame = frame.dropna()
    frame['Vol-1_SMA5'] = ta.sma(close=frame['Vol-1'], length=5)
    frame = frame.dropna()
    frame["vol"] = frame["Vol-1"] / frame["Vol-1_SMA5"]
   

    features = frame[feature_names]
    targets = frame['Return']
    train_features =features["2013-01-01":"2022-12-30"]
    train_targets = targets["2013-01-01":"2022-12-30"]
    test_features = features["2022-12-30":]
    test_targets = targets["2022-12-30":]
    
    knn = KNeighborsRegressor(n_neighbors=10)
    fit_knn = knn.fit(scale(train_features), train_targets)
    
    score_train_knn.append(fit_knn.score(scale(train_features), train_targets))
    score_test_knn.append(fit_knn.score(scale(test_features), test_targets))
    knn_pred[i] = list(fit.predict(scale(test_features)))
    
    

    
knn_score = pd.DataFrame({"score_train" : score_train_knn, "score_test" : score_test_knn}, index=rel_list)

CPU times: total: 1min 33s
Wall time: 1min 42s


In [13]:
#knn_score.to_csv("KNN1score.csv")
#knn_pred.to_csv("KNN1pred.csv")

In [16]:
knn_score

Unnamed: 0,score_train,score_test
A,0.114099,-0.082259
AAL,0.116776,-0.052531
AAPL,0.128821,-0.321000
ABT,0.109645,-0.166055
ACGL,0.104678,-0.009900
...,...,...
XOM,0.113652,-0.077421
XYL,0.136803,-0.108540
YUM,0.109913,-0.096093
ZBH,0.082112,-0.100816
