In [2]:
import pandas as pd
import numpy as np
import talib as ta
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

data = pd.read_csv("G:\DJI.csv") # load our data
# add features by using talib.   
data['EMA_10'] = ta.MA(data['Close'], timeperiod=10)   
data['RSI'] = ta.RSI(data['Close'], timeperiod=14)
data['ADX'] = ta.ADX(data['High'],data['Low'],data['Close'], timeperiod=14)
data['ATR'] = ta.ATR(data['High'],data['Low'],data['Close'], timeperiod=14)
# calculate daily stock returns from Adj Close
data['Return'] = data['Adj Close'].pct_change()

RISK=[0]
for i in range(1,len(data)):
    a=data.iloc[0:i,-1]
    res=np.std(a)
    RISK = np.row_stack((RISK, res))

data['Risk']=RISK

data=data.dropna() #delete data if data is NA
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_10,RSI,ADX,ATR,Return,Risk
27,2019-01-16,24139.910156,24288.609375,24119.720703,24207.160156,24207.160156,302830000,23749.775781,50.868358,22.020899,476.557611,0.005883,0.017594
28,2019-01-17,24147.089844,24474.460938,24088.900391,24370.099609,24370.099609,288590000,23918.163672,53.13249,20.997556,470.057821,0.006731,0.017317
29,2019-01-18,24534.189453,24750.220703,24459.029297,24706.349609,24706.349609,372970000,24045.482617,57.486487,20.678464,463.633769,0.013798,0.017065
30,2019-01-22,24607.759766,24607.759766,24244.310547,24404.480469,24404.480469,338480000,24132.795703,52.748764,19.763668,463.519861,-0.012218,0.016975
31,2019-01-23,24577.25,24700.980469,24307.169922,24575.619141,24575.619141,318600000,24211.612695,55.012421,19.135162,458.540625,0.007013,0.016829


In [3]:
data.corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,EMA_10,RSI,ADX,ATR,Return,Risk
Open,1.0,0.99385,0.989629,0.983312,0.983312,-0.353573,0.942853,0.44398,-0.067412,-0.73166,-0.093427,-0.754057
High,0.99385,1.0,0.990521,0.991632,0.991632,-0.347672,0.94395,0.44435,-0.059481,-0.717131,-0.0224,-0.763799
Low,0.989629,0.990521,1.0,0.993871,0.993871,-0.384515,0.92112,0.508474,-0.082257,-0.732777,0.019604,-0.732493
Close,0.983312,0.991632,0.993871,1.0,1.0,-0.380506,0.930203,0.496752,-0.072084,-0.724317,0.066977,-0.742501
Adj Close,0.983312,0.991632,0.993871,1.0,1.0,-0.380506,0.930203,0.496752,-0.072084,-0.724317,0.066977,-0.742501
Volume,-0.353573,-0.347672,-0.384515,-0.380506,-0.380506,1.0,-0.31817,-0.17048,-0.082653,0.19437,-0.1306,0.351098
EMA_10,0.942853,0.94395,0.92112,0.930203,0.930203,-0.31817,1.0,0.214242,0.016427,-0.756023,-0.130729,-0.830021
RSI,0.44398,0.44435,0.508474,0.496752,0.496752,-0.17048,0.214242,1.0,-0.241828,-0.419535,0.322002,0.097165
ADX,-0.067412,-0.059481,-0.082257,-0.072084,-0.072084,-0.082653,0.016427,-0.241828,1.0,0.03687,-0.051214,-0.21912
ATR,-0.73166,-0.717131,-0.732777,-0.724317,-0.724317,0.19437,-0.756023,-0.419535,0.03687,1.0,0.107609,0.572095


In [4]:
feature_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume','EMA_10','RSI','ADX','ATR']
X = data[feature_cols] # All Features
y = np.where(data['Return']>0,1,0)
y

array([1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1])

In [5]:
# Feature selection method. 
#We use Chi square method which is a filter method based on correlation.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.5,test_size=0.5, random_state=0) #We put 50% in train and 50% in test.
selector = SelectKBest(chi2, k=8) #find best 8 features by using chi2
selector.fit(X_train, y_train)
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print('Chi Square accuracy without selecting features: {:.3f}'.format(clf.score(X_test, y_test)))
clf_selected = make_pipeline(SelectKBest(chi2, k=8), MinMaxScaler(), LinearSVC()) 
clf_selected.fit(X_train, y_train)
print('Chi Square accuracy after feature selection: {:.3f}'.format(clf_selected.score(X_test, y_test)))

Chi Square accuracy without selecting features: 0.690
Chi Square accuracy after feature selection: 0.699


  return self.partial_fit(X, y)


In [6]:
f = selector.fit(X_train, y_train)
dfscores = pd.DataFrame(f.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']
print(featureScores.nlargest(8,'Score'))  #print 8 best features

    features         Score
5     Volume  8.702335e+06
9        ATR  4.583656e+01
6     EMA_10  2.229697e+01
0       Open  1.890570e+01
7        RSI  2.696389e+00
1       High  1.599852e+00
3      Close  9.606326e-01
4  Adj Close  9.606326e-01


In [7]:
# Find accuracy by using Decision Tree method.
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, train_size= 0.5,test_size=0.5, random_state=0) #We put 50% in train and 50% in test.
scaler = MinMaxScaler()
X1_train = scaler.fit_transform(X1_train)
X1_test = scaler.transform(X1_test)
clf1 = DecisionTreeClassifier().fit(X1_train, y1_train)
print('Decision Trees accuracy on training set: {:.3f}'.format(clf1.score(X1_train, y1_train)))
print('Decision Trees accuracy without selecting features on test set: {:.3f}'.format(clf1.score(X1_test, y1_test)))


Decision Trees accuracy on training set: 1.000
Decision Trees accuracy without selecting features on test set: 0.531


  return self.partial_fit(X, y)


In [8]:
new_furture=['Volume','Close','Adj Close','Low','High','EMA_10','Open','RSI']

In [9]:
new_X = data[new_furture] # Data with new Features
new_y = np.where(data['Return']>0,1,0)
X2_train, X2_test, y2_train, y2_test = train_test_split(new_X, new_y, train_size= 0.5,test_size=0.5, random_state=0) #We put 50% in train and 50% in test.
scaler = MinMaxScaler()
X2_train = scaler.fit_transform(X2_train)
X2_test = scaler.transform(X2_test)
clf2 = DecisionTreeClassifier().fit(X2_train, y2_train)
print('Decision Trees accuracy on training set: {:.3f}'.format(clf2.score(X2_train, y2_train)))
print('Decision Trees accuracy after feature selection on test set: {:.3f}'.format(clf2.score(X2_test, y2_test)))


Decision Trees accuracy on training set: 1.000
Decision Trees accuracy after feature selection on test set: 0.611


  return self.partial_fit(X, y)


In [10]:
feature_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume','EMA_10','RSI','ADX','ATR']
X = data[feature_cols] # All Features
z = np.where(data['Risk']>0.01,1,0)
X_train, X_test, z_train, z_test = train_test_split(X, z, train_size= 0.5,test_size=0.5, random_state=0)
f = selector.fit(X_train, z_train)
dfscores = pd.DataFrame(f.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']
print(featureScores.nlargest(8,'Score'))  #print 8 best features

    features         Score
5     Volume  1.873923e+08
2        Low  1.960050e+03
0       Open  1.951911e+03
6     EMA_10  1.933993e+03
3      Close  1.885471e+03
4  Adj Close  1.885471e+03
1       High  1.870951e+03
9        ATR  1.931385e+02
