## Creating train and test data for a Decision Tree Classfier Model

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from pandas_datareader import data as wb
import talib
%matplotlib widget

In [43]:
# Daily OLHCV data for the S&P 500 etf SPY
df1 = pd.DataFrame()
df1 = wb.DataReader('SPY', data_source='yahoo', start='2016-01-01')

In [44]:
# Create dataframe with SPY daily close price
df = pd.DataFrame()
df['SPY'] = df1['Close']

# Compute and add indicators to the dataframe
df['pct_chg_1'] = df['SPY'].diff()
df['pct_chg_5'] = df['SPY'].diff(periods=5)
df['diffSPY'] = df['SPY'] - df['SPY'].rolling(window = 5).mean()
df['VIX'] = wb.DataReader('^VIX', data_source='yahoo', start='2016-01-01')['Close']
df['RSI_7'] = talib.RSI(df.iloc[:, 1].values, timeperiod=7)
df['RSI_14'] = talib.RSI(df.iloc[:, 1].values)
df['diffRSI'] = df['RSI_7'] - df['RSI_7'].rolling(window = 14).mean()
df['MFI_7'] = talib.MFI(df1['High'], df1['Low'], df1['Close'], df1['Volume'], timeperiod=7)
df['MFI_14'] = talib.MFI(df1['High'], df1['Low'], df1['Close'], df1['Volume'], timeperiod=14)
df['signal'] = 0

In [45]:
# Truncate NaN values created while computing indicators and reset to a numerical index
df = df[21:]
df = df.reset_index()

In [46]:
# Set buy signals when a day's close price is lower than the past 5 and next 5 days
def buy_signals():
    for i in range(5, len(df)-5):
        if df['SPY'][i] == df['SPY'][i-5:i+5].min():
            df.at[i, 'signal'] = 1
            i += 4

# Set sell signals at the highest close price between 2 subsequent buy signals
def sell_signals():
    for n in range(len(buys)-1):
        buy1 = buys[n]
        buy2 = buys[n+1]
        maxIdx = df.iloc[buy1:buy2]['SPY'].idxmax()
        df.at[maxIdx, 'signal'] = -1

In [47]:
# Generate buy and sell signals
buy_signals()
buys = df.index[df['signal'] == 1].tolist()
sell_signals()

In [48]:
# df

In [49]:
plt.figure(figsize = (9,7))

# plot close price
df['SPY'].plot(color = 'k', label= 'Close Price') 

# plot buy prices
plt.plot(df[df['signal'] == 1].index, 
         df['SPY'][df['signal'] == 1], 
         '^', markersize = 6, color = 'lime', label = 'buy')

# plot sell prices
plt.plot(df[df['signal'] == -1].index, 
         df['SPY'][df['signal'] == -1], 
         'v', markersize = 6, color = 'r', label = 'sell')

plt.ylabel('Price in USD', fontsize = 15 )
plt.xlabel('Date', fontsize = 15 )
plt.title('SPY train and test data', fontsize = 20)
plt.legend()
plt.grid()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [50]:
# Set Date as index
datetime_index = pd.DatetimeIndex(pd.to_datetime(df['Date']))
df = df.set_index(datetime_index)
df = df.drop(['Date'], axis=1)

In [51]:
df

Unnamed: 0_level_0,SPY,pct_chg_1,pct_chg_5,diffSPY,VIX,RSI_7,RSI_14,diffRSI,MFI_7,MFI_14,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-02-02,190.160004,-3.489990,-0.039993,-0.793997,21.980000,43.705808,48.319396,-8.474463,43.490759,46.867821,0
2016-02-03,191.300003,1.139999,3.169998,-0.287997,21.650000,52.635293,52.509172,-0.069949,40.760668,47.156871,0
2016-02-04,191.600006,0.300003,2.490005,-0.485995,21.840000,50.925505,51.690415,-0.987374,40.753772,45.120105,0
2016-02-05,187.949997,-3.650009,-5.770004,-2.982004,23.379999,43.222758,47.907477,-8.433757,40.917813,47.666535,0
2016-02-08,185.419998,-2.529999,-8.229996,-3.866003,26.000000,45.928279,49.046164,-5.104411,39.494709,40.017872,0
2016-02-09,185.429993,0.009995,-4.730011,-2.910007,26.540001,51.982192,51.628505,0.770506,37.826415,49.216243,0
2016-02-10,185.270004,-0.159988,-6.029999,-1.863995,26.290001,51.531728,51.440617,0.626237,38.173894,48.192515,0
2016-02-11,182.860001,-2.410004,-8.740005,-2.525998,28.139999,45.449512,48.903638,-4.381162,37.331668,40.262263,1
2016-02-12,186.630005,3.770004,-1.319992,1.508005,25.400000,60.419433,55.408669,9.604474,50.592040,45.591342,0
2016-02-16,189.779999,3.149994,4.360001,3.785999,24.110001,58.539147,54.656875,7.552941,49.690177,45.104568,0


In [53]:
# Output data as CSV
df.to_csv(r'SPY_train_test_data.csv')