In [1]:
import pandas as pd
import numpy as np  
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from stock_trading_util import download_price, read_price, line_plot, add_bbvalue, add_so, add_rtrn, add_rsi, calc_bt_pl

  from numpy.core.umath_tests import inner1d


In [2]:
# Obtain the SPY price frame that every later cell works on.
# Only read_price is active; the download_price and line_plot calls are disabled.
# NOTE(review): the disabled download ends at 2019-12-31 while the active read asks
# for data through 2020-06-30 — presumably the stored data was refreshed separately; confirm.
#df_prc = download_price('SPY', '2010-01-01', '2019-12-31')
df_prc = read_price('SPY', '2010-01-01', '2020-06-30')
#line_plot(df_prc, 'SPY')

In [3]:
# Start over from just the Date and SPY columns so every indicator column
# below is recomputed by its helper rather than carried over.
df_prc = add_bbvalue(df_prc[['Date', 'SPY']], 'SPY')
df_prc['bbvalue20'] = df_prc['bbvalue20'].div(100.0)

# Each indicator is rescaled right after the helper adds it.
# NOTE(review): so20 and rsi14 are shifted by 0.5 and divided by 10 — the native
# range of these columns is defined in stock_trading_util; confirm the scaling.
df_prc = add_so(df_prc, 'SPY')
df_prc['so20'] = df_prc['so20'].sub(0.5).div(10.0)

df_prc = add_rsi(df_prc, 'SPY')
df_prc['rsi14'] = df_prc['rsi14'].sub(0.5).div(10.0)

# Return columns, added in this order: 'b' over 20, 5 and 1 days, then 'f' over 1 day
# (presumably 'b' = backward-looking and 'f' = forward-looking — confirm in add_rtrn).
for rtrn_direction, rtrn_days in (('b', 20), ('b', 5), ('b', 1), ('f', 1)):
    df_prc = add_rtrn(df_prc, 'SPY', rtrn_direction, rtrn_days)

In [4]:
# Build the three-way 'holding' label from rtrn_f_1d:
#   +1 where it is above 0.002, -1 where it is below -0.002, 0 otherwise
#   (missing values compare False on both sides and therefore also map to 0).
# pop() takes rtrn_f_1d out of the frame in place, so 'holding' ends up as the last column.
fwd_rtrn = df_prc.pop('rtrn_f_1d')
is_up = fwd_rtrn > 0.002
is_down = fwd_rtrn < -0.002
df_prc['holding'] = is_up.astype('int64') - is_down.astype('int64')

In [5]:
# Number of rows in each 'holding' class; the bare name on the last line displays the result.
df_cnt = df_prc.groupby('holding')['holding'].count()
df_cnt

holding
-1     818
 0     720
 1    1102
Name: holding, dtype: int64

In [6]:
# Fitting window: positional rows 2016..2515 of df_prc.
# Features are the columns from position 2 up to (not including) the last one;
# the last column is the target.
fit_window = df_prc.iloc[2016:2516]
X = fit_window.iloc[:, 2:-1].values
y = fit_window.iloc[:, -1].values

# NOTE(review): train_test_split shuffles rows by default, so the 20% test fold is
# drawn from the same date range as the training fold instead of a later period —
# confirm that is intended for this time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Alternative positional split, kept disabled:
#X_train = df_prc.iloc[20:2516, 2:-1].values
#X_test = df_prc.iloc[-120:-1, 2:-1].values
#y_train = df_prc.iloc[20:2516, -1].values
#y_test = df_prc.iloc[-120:-1, -1].values
def X_y_score(idx):
    """Slice the scoring window out of the module-level ``df_prc``.

    The window covers positional rows from ``idx`` up to, but not including,
    the final row of the frame.

    Returns a 3-tuple:
      * array of the columns from position 2 up to (excluding) the last one,
      * array of the last column's values,
      * the DataFrame slice for the same rows (all columns).
    """
    window = df_prc.iloc[idx:-1]
    features = window.iloc[:, 2:-1].values
    target = window.iloc[:, -1].values
    return features, target, window
# Scoring set: positional rows from 2516 (where the 2016:2516 fitting window stops)
# up to, but excluding, the final row of df_prc.
X_score, y_score, df_score = X_y_score(2516)

In [7]:
# Hyper-parameters gathered in one place; both random_state values in this notebook
# (the split's and the forest's) are fixed, which keeps re-runs comparable.
rf_params = {
    'max_depth': 10,
    'n_estimators': 20,
    'n_jobs': 2,
    'random_state': 100,
}
clf = RandomForestClassifier(**rf_params)
# Left as the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=2, oob_score=False, random_state=100,
            verbose=0, warm_start=False)

In [8]:
# Confusion table on the training fold: rows are actual labels, columns are predictions.
train_pred = clf.predict(X_train)
pd.crosstab(
    y_train,
    train_pred,
    rownames=['Actual pstn'],
    colnames=['Predicted pstn'],
)

Predicted pstn,-1,0,1
Actual pstn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,112,0,8
0,1,104,3
1,0,1,171


In [9]:
# Confusion table on the held-out test fold: rows are actual labels, columns are predictions.
test_pred = clf.predict(X_test)
pd.crosstab(
    y_test,
    test_pred,
    rownames=['Actual pstn'],
    colnames=['Predicted pstn'],
)

Predicted pstn,-1,0,1
Actual pstn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,6,6,18
0,6,7,14
1,13,9,21


In [10]:
# Confusion table on the scoring window: rows are actual labels, columns are predictions.
score_pred = clf.predict(X_score)
pd.crosstab(
    y_score,
    score_pred,
    rownames=['Actual pstn'],
    colnames=['Predicted pstn'],
)

Predicted pstn,-1,0,1
Actual pstn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,14,14,21
0,5,5,4
1,9,11,40


In [11]:
# Assemble the backtest input for the scoring window: actual label, predicted
# position and the SPY price for the same rows.
predicted_pstn = clf.predict(X_score)
df = pd.DataFrame({'Actual': y_score, 'pstn': predicted_pstn, 'price': df_score['SPY']})

# calc_bt_pl hands back two figures, reported below as the buy-and-hold result
# and the model-driven trading result (how they are computed lives in stock_trading_util).
pl_buyhold, pl_trade = calc_bt_pl(df)

report_lines = [
    'Buy and hold returns ${} for $1 investment.'.format(pl_buyhold),
    'Random forest based trading returns ${} for $1 investment.'.format(pl_trade),
]
print('\n'.join(report_lines))

Buy and hold returns $0.9331 for $1 investment.
Random forest based trading returns $1.9822 for $1 investment.
