# IntraDay Decision Tree

Attempt similar strategy to previous Decision Tree technique with technical indicators except using intraday data.

Data source : http://www.histdata.com/download-free-forex-historical-data/?/metatrader/1-minute-bar-quotes/EURUSD

## Prerequisites : 
- Download a couple months worth of intraday data
- Copy the \*.csv files into /data/histdata folder

In [83]:
from os import listdir
from ta import *
import pandas, numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
%matplotlib inline

In [3]:
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

In [11]:
data_dir = 'data/histdata/'
data_files = list(filter(lambda a : ('.csv' in a), listdir(data_dir)))
data_files

['DAT_MT_EURUSD_M1_201809.csv',
 'DAT_MT_EURUSD_M1_201808.csv',
 'DAT_MT_EURUSD_M1_201805.csv',
 'DAT_MT_EURUSD_M1_201804.csv',
 'DAT_MT_EURUSD_M1_201806.csv',
 'DAT_MT_EURUSD_M1_201807.csv']

In [84]:
df = pandas.read_csv(data_dir + 'DAT_MT_EURUSD_M1_201809.csv', names=['date','time','open', 'max', 'min', 'close', 'vol'])
df.describe()

Unnamed: 0,open,max,min,close,vol
count,14348.0,14348.0,14348.0,14348.0,14348.0
mean,1.161197,1.161272,1.161121,1.161197,0.0
std,0.003702,0.003702,0.003704,0.003702,0.0
min,1.15279,1.15294,1.1526,1.15274,0.0
25%,1.15865,1.15871,1.15858,1.15864,0.0
50%,1.16104,1.16115,1.16095,1.16104,0.0
75%,1.16293,1.16302,1.16286,1.16293,0.0
max,1.17213,1.17215,1.17195,1.17212,0.0


In [116]:
df['datetime'] = df['date'] + ' - ' + df['time']

In [119]:
df

Unnamed: 0,date,time,open,max,min,close,vol,datetime
0,2018.09.02,17:00,1.15950,1.15951,1.15950,1.15951,0,2018.09.02 - 17:00
1,2018.09.02,17:01,1.15951,1.15951,1.15951,1.15951,0,2018.09.02 - 17:01
2,2018.09.02,17:02,1.15950,1.15950,1.15950,1.15950,0,2018.09.02 - 17:02
3,2018.09.02,17:05,1.15951,1.15951,1.15951,1.15951,0,2018.09.02 - 17:05
4,2018.09.02,17:06,1.15950,1.15950,1.15950,1.15950,0,2018.09.02 - 17:06
5,2018.09.02,17:08,1.15950,1.15952,1.15950,1.15952,0,2018.09.02 - 17:08
6,2018.09.02,17:09,1.15991,1.15991,1.15950,1.15952,0,2018.09.02 - 17:09
7,2018.09.02,17:10,1.15950,1.15952,1.15950,1.15950,0,2018.09.02 - 17:10
8,2018.09.02,17:18,1.15927,1.15935,1.15923,1.15935,0,2018.09.02 - 17:18
9,2018.09.02,17:19,1.15935,1.15938,1.15935,1.15938,0,2018.09.02 - 17:19


In [147]:
def readDAT(filename):
    dateparse = lambda x: pd.datetime.strptime(x, '%Y.%m.%d')
#     df = pd.read_csv(infile, parse_dates=['datetime'], date_parser=dateparse)
    
    df = pandas.read_csv(filename, names=['date','time','open', 'max', 'min', 'close', 'vol'])
    df['datetime'] = df['date'] + ' - ' + df['time']
    df['datetime'] = pandas.to_datetime(df['datetime'], format='%Y.%m.%d - %H:%M')
    return df
    
def readAllDatForCurrency(data_dir, currencyCode):
    dataFileNames = list(filter(lambda a : (currencyCode in a), listdir(data_dir)))
    dfs = [readDAT(data_dir + name) for name in dataFileNames]
    allDf = pd.concat(dfs)
    allDf = allDf.sort_values(by=['datetime'], ascending=True).reset_index(drop=True)
    return allDf
    
def runCalculators(series, calculators=[]):
    df_obj = {}
    for ind, fn in enumerate(calculators):
        df_obj[fn.__name__] = fn(series)
    return pd.DataFrame(df_obj)


def splitData(df, split):
    train = df.iloc[:int(len(df)*split)]
    test = df.iloc[int(len(df)*split):]
    
    return train, test


def trainDecisionTree(inputDf, outputDf):
    clf = DecisionTreeRegressor(random_state=0)
    clf.fit(inputDf, outputDf)
    return clf


def makeTrades(predictions):
#     ceil = np.percentile(predictions, 75)
#     floor = np.percentile(predictions, 50)
    thresh = np.percentile(predictions, 60)
    
    trade_pos = np.where(predictions > thresh)[0]
    return trade_pos

def getCompoundGains(series, trades):
    gains = others.daily_return(series).shift(-1)[trades]
    gains = np.add(np.divide(gains, 100), 1.0)
    return np.product(gains)

In [148]:
allDf = readAllDatForCurrency(data_dir, 'EURUSD')
# allDf.sort_values(by=['datetime'], ascending=True).reset_index(drop=True)
allDf

Unnamed: 0,date,time,open,max,min,close,vol,datetime
0,2018.04.01,17:00,1.23186,1.23207,1.23186,1.23207,0,2018-04-01 17:00:00
1,2018.04.01,17:02,1.23209,1.23209,1.23207,1.23207,0,2018-04-01 17:02:00
2,2018.04.01,17:03,1.23212,1.23212,1.23212,1.23212,0,2018-04-01 17:03:00
3,2018.04.01,17:05,1.23207,1.23207,1.23202,1.23203,0,2018-04-01 17:05:00
4,2018.04.01,17:06,1.23205,1.23205,1.23205,1.23205,0,2018-04-01 17:06:00
5,2018.04.01,17:07,1.23206,1.23206,1.23200,1.23200,0,2018-04-01 17:07:00
6,2018.04.01,17:08,1.23198,1.23207,1.23198,1.23207,0,2018-04-01 17:08:00
7,2018.04.01,17:09,1.23208,1.23210,1.23208,1.23210,0,2018-04-01 17:09:00
8,2018.04.01,17:10,1.23209,1.23210,1.23209,1.23210,0,2018-04-01 17:10:00
9,2018.04.01,17:11,1.23205,1.23205,1.23175,1.23175,0,2018-04-01 17:11:00


In [149]:
allDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172247 entries, 0 to 172246
Data columns (total 8 columns):
date        172247 non-null object
time        172247 non-null object
open        172247 non-null float64
max         172247 non-null float64
min         172247 non-null float64
close       172247 non-null float64
vol         172247 non-null int64
datetime    172247 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 10.5+ MB


In [153]:
def runDTBacktest(df, filename=None):

    input_calculators=[trend.dpo, trend.macd, trend.macd_signal, trend.macd_diff, momentum.tsi, momentum.rsi, trend.trix, volatility.bollinger_hband, volatility.bollinger_lband]
    output_calculators=[lambda s : others.daily_return(s).shift(-1)]
    
    df_inds = runCalculators(df.close, input_calculators)
    df_outs = runCalculators(df.close, output_calculators)
    df = pd.concat([df, df_inds, df_outs], axis=1, join_axes=[df.index])
    df = df.dropna()
    df_inds = df[df_inds.columns]
    df_outs = df[df_outs.columns]
    
    train_split = 0.9
    train_in, test_in = splitData(df_inds, train_split)
    train_out, test_out = splitData(df_outs, train_split)
    
    decTree = trainDecisionTree(train_in, train_out)
    preds = decTree.predict(test_in)
    
    trades = makeTrades(preds)
    totalGain = getCompoundGains(df.close, trades)
    
    return {
        'filename':filename,
        'gain' : totalGain,
        'df':df,
        'trades':trades,
        'test_df':pd.concat([df, test_in, test_out], axis=1, join_axes=[test_in.index])
    }

# metrics
def getMetrics(backTestResult):
    
    returns = np.add(np.divide(others.daily_return(backTestResult['test_df'].close),100),1)
    
    return {
        'filename' : backTestResult['filename'],
        'gain' : backTestResult['gain'],
        'total_gain' : np.product(returns),
        'beat_market' : backTestResult['gain'] > np.product(returns),
        'start_date' : backTestResult['test_df']['date'].iloc[0],
        'end_date' : backTestResult['test_df']['date'].iloc[-1]
#         'perfect_gain' : np.product(returns[np.where(returns > 1.0)])
    }


In [154]:
results = runDTBacktest(readAllDatForCurrency(data_dir, 'EURUSD'))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


In [155]:
sortedResults = sorted(list(map(getMetrics, [results])), key=lambda x: x['gain'])
sortedResults.reverse()
sortedResults

[{'filename': None,
  'gain': 1.0037269355747471,
  'total_gain': 0.9927304722203255,
  'beat_market': True,
  'start_date': '2018.08.29',
  'end_date': '2018.09.14'}]

In [89]:
results['gain']

0.9997162566296541