In [1]:
import pandas as pd
import numpy as np

import mplfinance as fplt
import matplotlib.pyplot as plt

import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import seaborn as sns

In [26]:
def load_file(data_path):
    df = pd.read_csv(data_path,
                         sep='\t',
                         names=['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume'],
                         skiprows=1
                         )
    df[['Open','High','Low','Close']] = df[['Open','High','Low','Close']].apply(pd.to_numeric, errors='coerce')  
    df['Ticker'] = data_path
    df = df.tail(50000) #for testing purpose
    df.index = pd.DatetimeIndex(df['DateTime'])
    return df

In [27]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    if iteration == total: 
        print()

In [28]:
def sim_ts(df, s, ts=0.0001, max_len=144,dir='buy'):
    if dir=='buy':
        tsl=0
    else:
        tsl=np.inf
    for i in range(s,s+max_len):
        if dir=='buy':
            tsl = max(tsl, df.High.values[i] - ts)
            if df.Low.values[i]<tsl:
                return tsl
        else: 
            tsl = max(tsl, df.Low.values[i] + ts)
            if df.High.values[i]>tsl:
                return tsl            
    return df.Close.values[i]

# Prepare Data

In [29]:
data_paths = ["D:\\A484018\\dev\\GBPJPY_M5.csv",
              "D:\\A484018\\dev\\EURUSD_M5.csv",
              "D:\\A484018\\dev\\AUDCHF_M5.csv"]
              
dfs = [load_file(f) for f in data_paths]

In [30]:
for df in dfs:
    #row_size = 0.0001 #EURUSD
    row_size = df.Close.mean() * 0.001

    clen=144
    fblen=12
    oblen=12
    shift=120

    #trailing_stop=0.0010 #EURUSD
    trailing_stop=df.Close.mean() * 0.01
    max_trd_len=96

    break_step = df.Close.mean() * 0.001
    study_step=1

    l_min = df.Low.min()
    h_max = df.High.max()

    df['first_row'] = np.rint((df.Low-l_min) / row_size).astype(int)
    df['last_row'] = np.rint((df.High-l_min) / row_size).astype(int)

    npdf=np.zeros((11,len(df)))
    for s in range(clen,len(df)-clen-oblen,study_step):

        printProgressBar (s-clen, len(df)-2*clen-oblen-1, prefix = '', suffix = '')

        sub_df = df[s-clen:s]
        r_min =  sub_df.first_row.min() 
        c = np.zeros(sub_df.last_row.max() - r_min)       
        for v,a,b in zip(sub_df.Volume.values, sub_df.first_row, sub_df.last_row):
            c[a-r_min:b+1-r_min] += (v/(1+b-a))     
        id = np.arange(sub_df.first_row.min(),sub_df.last_row.max())
        idx = np.round(l_min + (row_size/2) + id*row_size,4)          
        profile = pd.Series(data=c, index=idx)

        stats = np.zeros((len(profile),len(profile)))
        for w in range(len(profile)//3,2*len(profile)//3):
            for i in range(0,len(profile)-w):
                stats[w,i]=profile.values[i:i+w].sum()/np.power(w,0.75)

        w,i = np.unravel_index(np.argmax(stats),stats.shape)

        c = ['blue']*len(profile)
        c[i:i + (w)] = ['red']*(w)

        chan_dn=profile.index[i] - break_step
        chan_up=profile.index[i+w] + break_step

        #Profile stats
        st1 = np.round(100*profile.values[i:i + w].sum() / profile.values.sum(),2)
        st2 = np.round(100*w/len(profile),2)
        st3 = np.round(100*(i+i+w)/(2*len(profile)),2)

        # final balance
        fb_up,fb_mid,fb_dn=0,0,0
        for j in range(s-fblen,s):
            fb_up += max(0,df.High.values[j] - chan_up) - max(0,df.Low.values[j] - chan_up)
            fb_mid += max(0,(df.High.values[j] - df.Low.values[j]) - max(0,df.High.values[j] - chan_up) - max(0,chan_dn - df.Low.values[j]))
            fb_dn += max(0,chan_dn - df.Low.values[j]) - max(0,chan_dn - df.High.values[j])

        tot = fb_up+fb_mid+fb_dn
        fb_up = np.round(100*fb_up/tot,2)
        fb_mid = np.round(100*fb_mid/tot,2)
        fb_dn = np.round(100*fb_dn/tot,2)

        #output
        ob_up,ob_mid,ob_dn=0,0,0
        for j in range(s,s+oblen):
            ob_up += max(0,df.High.values[j] - chan_up) - max(0,df.Low.values[j] - chan_up)
            ob_mid += max(0,(df.High.values[j] - df.Low.values[j]) - max(0,df.High.values[j] - chan_up) - max(0,chan_dn - df.Low.values[j]))
            ob_dn += max(0,chan_dn - df.Low.values[j]) - max(0,chan_dn - df.High.values[j])

        tot = ob_up+ob_mid+ob_dn
        ob_up = np.round(100*ob_up/tot,2)
        ob_mid = np.round(100*ob_mid/tot,2)
        ob_dn = np.round(100*ob_dn/tot,2)

        #trailling stop results
        ts_buy=sim_ts(df,s,ts=trailing_stop, max_len=max_trd_len,dir='buy')
        ts_sell=sim_ts(df,s,ts=trailing_stop, max_len=max_trd_len,dir='sell')

        npdf[:,s] = [ts_buy,ts_sell,st1,st2,st3,fb_up,fb_mid,fb_dn,ob_up,ob_mid,ob_dn]

        """    
        print('----------------------------------------------------------')
        print(s,df.DateTime.values[s])
        print('main profile:', st1,'%   /', st2,'%   /', st3,'%')
        print('final balance:', fb_up,'%   /', fb_mid,'%   /', fb_dn,'%')
        print('output balance:', ob_up,'%   /', ob_mid,'%   /', ob_dn,'%')
        """

    df[['ts_buy','ts_sell','mp%_in','mp%_size', 'mp%_dir','fb_up','fb_mid','fb_dn','ob_up','ob_mid','ob_dn']]=npdf.T
    df[['mp%_in_shift','mp%_size_shift', 'mp%_dir_shift']] = df[['mp%_in','mp%_size', 'mp%_dir']].shift(shift)

 |----------------------------------------------------------------------------------------------------| 0.6% 



 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
 |████████████████████████████------------------------------------------------------------------------| 28.9% 

ValueError: attempt to get argmax of an empty sequence

In [None]:
for df in dfs:
    df['buy_profit'] = df.ts_buy - df.Close 
    df['sell_profit']= df.Close - df.ts_sell

    df['adj_close'] = (df.High + df.Low + df.Close)/3

    df['sma12_var'] = (df['adj_close']/df['adj_close'].rolling(12).mean())-1
    df['sma48_var'] = (df['adj_close']/df['adj_close'].rolling(48).mean())-1
    df['sma180_var'] = (df['adj_close']/df['adj_close'].rolling(180).mean())-1

    df['spread']=((df['adj_close']/df['Open'])-1).abs()
    df['spread14_e']=df['spread'].ewm(span=14).mean()

    df['volume14_34_var'] = (df['Volume'].rolling(14).mean()/df['Volume'].rolling(34).mean())-1
    df['volume14_34_var'] = df['volume14_34_var'].fillna(0.0)

In [None]:
features = ['mp%_in', 'mp%_size', 'mp%_dir',
       'fb_up', 'fb_mid', 'fb_dn', 'ob_up', 'ob_mid', 'ob_dn', 'mp%_in_shift',
       'mp%_size_shift', 'mp%_dir_shift',
       'sma12_var', 'sma48_var', 'sma180_var', 'spread',
       'spread14_e', 'volume14_34_var']

dfs2=[]
for df in dfs:
    df.buy_profit = df.buy_profit.shift(-oblen)
    df['labels'] = (df.buy_profit > df.Close.mean() * 0.002)

    dfs2.append(df.dropna().copy())
    
df2 = pd.concat(dfs2)

# Run Models

In [None]:
X = df2[features]
y = df2['labels'].astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
RF = RandomForestClassifier(max_depth=10, random_state=411)
RF.fit(X_train, y_train)

DT = DecisionTreeClassifier(max_depth=16, random_state=411)
DT.fit(X_train, y_train)

In [None]:
y_pred_DT = DT.predict(X_test)
y_pred_RF = RF.predict(X_test)

In [None]:
print('RND full score:', y.sum()/y.count())
print('DT train score:',DT.score(X_train,y_train))
print("DT test_score:", DT.score(X_test,y_test))
print('RF train score:',RF.score(X_train,y_train))
print("RF test_score:", RF.score(X_test,y_test))

# Decision Tree

In [None]:
print('depth:',DT.get_depth())
print('n_leaves:',DT.get_n_leaves())

In [None]:
text_representation = sklearn.tree.export_text(DT, max_depth=2, feature_names=list(features), show_weights=True)
print(text_representation)

In [None]:
importances = DT.feature_importances_
std = np.std(DT.feature_importances_)
forest_importances = pd.Series(importances, index=features)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred_DT)
sns.heatmap(cf_matrix, annot=True, fmt='g')

# Random Forest

In [None]:
depth = [tree.get_depth() for tree in RF.estimators_]
n_leaves = [tree.get_n_leaves() for tree in RF.estimators_]
print('depth:',np.min(depth),np.max(depth),np.mean(depth))
print('n_leaves:',np.min(n_leaves),np.max(n_leaves),np.mean(n_leaves),np.sum(n_leaves))

In [None]:
importances = RF.feature_importances_
std = np.std([t.feature_importances_ for t in RF.estimators_], axis=0)
forest_importances = pd.Series(importances, index=features)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred_RF)
sns.heatmap(cf_matrix, annot=True, fmt='g')

# Testing models

In [None]:
w = np.sum((y_pred_DT == 1) & (y_test==1))
l = np.sum((y_pred_DT == 1) & (y_test==0))
print('score:',w/(w+l))
print('roc score:',roc_auc_score(y_test, y_pred_DT))
print('count:',np.sum(y_pred_DT))

In [None]:
indexes = X_test[y_pred_RF == 1].sort_index().index.values
mask=np.concatenate(([True],np.diff(indexes,1).astype(float) > 3.2000e+13))
df2.loc[indexes[mask]].sum()

In [None]:
"""s=200
ts = pd.to_datetime(str(df.index.values[s])) 

fplt.plot(
    df[s-10:s+100],
    type='candle',
    style='charles',
    hlines=[chan_dn,chan_up],
    vlines = ts.strftime('%Y-%m-%d %H:%M')
)"""