In [40]:
import math 
import numpy as np 
import pandas as pd 
from datetime import date, timedelta, datetime
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import RobustScaler
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
import datetime
import backtrader as bt
import backtrader.analyzers as btanalyzers

from strategy import *

In [41]:
import warnings
# Silence every warning category for the rest of the session. Note that this also
# hides any deprecation or chained-assignment warnings the later cells might emit.
warnings.filterwarnings('ignore')

# Step #1 Load the Time Series Data

# Step #4 Train-Test Split

In [42]:
def train_test_split(df, train_end='2017-12-10', test_start='2017-12-11'):
    """Split a date-indexed frame chronologically into a train and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports label slicing with date strings.
    train_end : str, default '2017-12-10'
        Last index label (inclusive) that belongs to the training part.
    test_start : str, default '2017-12-11'
        First index label (inclusive) that belongs to the test part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``. Both are independent copies, so later in-place
        edits (e.g. scaling) never write back into ``df``.
    """
    # .copy() detaches the slices from df; writing into a bare .loc slice is
    # chained assignment and may or may not touch the parent frame.
    train = df.loc[:train_end, :].copy()
    test = df.loc[test_start:, :].copy()
    return train, test

def check_length(train,test,df):
    """Print class balance and size statistics for a train/test split.

    For each split the value counts of its ``state`` column are printed,
    followed by the row counts of both splits and their share of ``df``.
    Nothing is returned.
    """
    # Class distribution of each split, each followed by a blank spacer.
    for split_name, frame in (('training', train), ('testing', test)):
        print('value_counts for {} state:\n'.format(split_name), frame.state.value_counts())
        print('\n')

    n_train, n_test = len(train), len(test)
    print('length of train data:', n_train, '\nlength of test data:', n_test)
    print('\n')

    # Shares are relative to the full frame, not to n_train + n_test.
    total = len(df)
    print('proportion of train data:', n_train / total, '\nproportion of test data:', n_test / total)
    print('\n')

# Step #5 Scaling and Transforming the Data

In [43]:
def scale_transform_data(train, test):
    """Robust-scale the feature columns of both splits using train-only statistics.

    Every column except the last one is treated as a feature; the last column
    is left untouched. The scaler is fitted on ``train`` only and then applied
    to both splits, so no test information leaks into the scaling parameters.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with identical columns, label column last.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, test_scaled)``. These are new frames; the inputs are
        not modified.
    """
    f_columns = list(train.columns[:-1])
    f_transformer = RobustScaler().fit(train[f_columns].to_numpy())

    # Work on copies: the inputs are typically .loc slices of a larger frame, and
    # writing into such slices is chained assignment. Replacing whole columns
    # (instead of .loc[:, cols] = ...) also avoids in-place dtype upcasting when a
    # feature column is stored as integers.
    train_scaled = train.copy()
    test_scaled = test.copy()
    train_scaled[f_columns] = f_transformer.transform(train[f_columns].to_numpy())
    test_scaled[f_columns] = f_transformer.transform(test[f_columns].to_numpy())
    return train_scaled, test_scaled

# Step #6 Cutting the time series into subsequences & one-hot encoding y

In [44]:
#reshape to [#samples, #time_steps, #n_features]
def create_dataset(X, y, time_steps=1):
    """Cut a frame into overlapping windows, each paired with the label that follows it.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame; every column except the last is used as a feature.
    y : pandas.Series
        Labels, positionally aligned with the rows of ``X``.
    time_steps : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    tuple of numpy.ndarray
        Windows stacked as ``[samples, time_steps, features]`` and the label at
        the row immediately after each window, shaped ``(samples, 1)``.
    """
    features = X.iloc[:, :-1].values
    n_windows = len(X) - time_steps
    windows = [features[start:start + time_steps] for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets).reshape(-1, 1)

In [45]:
def onehotencoder_y(y_train,y_test):
    """One-hot encode the train and test labels with an encoder fitted on train only.

    The encoder is created with ``handle_unknown='ignore'`` and fitted on
    ``y_train``; both label arrays are then transformed and densified.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test, encoder)``. The fitted encoder is
        returned so predictions can be decoded with ``inverse_transform``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore').fit(y_train)
    encoded_train = encoder.transform(y_train).toarray()
    encoded_test = encoder.transform(y_test).toarray()
    return encoded_train, encoded_test, encoder

# Step #7 Build LSTM & Bidirectional LSTM Model on Train data in Keras

In [46]:
def weights(train_data):
    """Compute 'balanced' class weights from the last column of ``train_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame whose last column holds the class labels.

    Returns
    -------
    dict
        ``{position: weight}`` where ``position`` is the rank of the class among
        the sorted unique labels (0, 1, 2, ...), as produced by ``enumerate``.
    """
    labels = train_data.iloc[:, -1]
    # classes / y must be passed by keyword: recent scikit-learn releases declare
    # them keyword-only, so the positional form raises TypeError there.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels)
    return dict(enumerate(class_weights))

In [47]:
def Bi_LSTM_model(X_train,y_train):
    """Build and compile a bidirectional-LSTM softmax classifier.

    The input shape is read from ``X_train`` (axes 1 and 2) and the number of
    output units from ``y_train.shape[1]``. No dropout layer is used in this
    variant. Currently identical to ``Bi_LSTM_model_1``.

    Returns the compiled (untrained) Keras model.
    """
    n_steps, n_features = X_train.shape[1], X_train.shape[2]
    n_classes = y_train.shape[1]

    recurrent = keras.layers.LSTM(units=128, input_shape=[n_steps, n_features])
    model = keras.Sequential([
        keras.layers.Bidirectional(recurrent),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dense(n_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [48]:
def LSTM_model(X_train,y_train):
    """Build and compile a single-layer LSTM softmax classifier.

    Architecture: LSTM(64) -> Dropout(0.5) -> Dense(32, relu) -> Dropout(0.5)
    -> Dense(n_classes, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and accuracy as metric.

    Parameters
    ----------
    X_train : array-like
        Training windows; axes 1 and 2 define the input shape.
    y_train : array-like
        One-hot labels; ``y_train.shape[1]`` is the number of classes.

    Returns
    -------
    The compiled (untrained) Keras model.
    """
    model = keras.Sequential()
    # Take every layer from the same `keras` namespace as the Sequential container
    # (as the Bi-LSTM builders do) instead of mixing in the separately imported
    # standalone `LSTM` class.
    model.add(keras.layers.LSTM(units=64,input_shape=[X_train.shape[1], X_train.shape[2]]))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(units=32, activation='relu'))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [49]:
def LSTM_model_1(X_train,y_train):
    """Build and compile the LSTM classifier used by ``operation``.

    Architecture: LSTM(64) -> Dropout(0.5) -> Dense(32, relu) -> Dropout(0.5)
    -> Dense(n_classes, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and accuracy as metric. The layer stack matches
    ``LSTM_model``.

    Parameters
    ----------
    X_train : array-like
        Training windows; axes 1 and 2 define the input shape.
    y_train : array-like
        One-hot labels; ``y_train.shape[1]`` is the number of classes.

    Returns
    -------
    The compiled (untrained) Keras model.
    """
    input_shape = [X_train.shape[1], X_train.shape[2]]
    n_classes = y_train.shape[1]

    model = keras.Sequential()
    # All layers come from the same `keras` namespace as the Sequential container;
    # the separately imported standalone `LSTM` class is deliberately not mixed in.
    model.add(keras.layers.LSTM(units=64, input_shape=input_shape))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(units=32, activation='relu'))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [50]:
def Bi_LSTM_model_1(X_train,y_train):
    """Build and compile the bidirectional-LSTM classifier used by ``operation``.

    Stack: Bidirectional(LSTM(128)) -> Dense(64, relu) -> Dense(n_classes,
    softmax), compiled with categorical cross-entropy, Adam and accuracy.
    Currently identical to ``Bi_LSTM_model``.

    Returns the compiled (untrained) Keras model.
    """
    window_shape = [X_train.shape[1], X_train.shape[2]]
    stack = (
        keras.layers.Bidirectional(
            keras.layers.LSTM(units=128, input_shape=window_shape)),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dense(y_train.shape[1], activation='softmax'),
    )

    model = keras.Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [51]:
def fit_LSTM(is_weighted,X_train,y_train,model,weights=None,
             epochs=50,batch_size=32,validation_split=0.1,verbose=0):
    """Train ``model`` on the given windows, optionally with class weights.

    Parameters
    ----------
    is_weighted : bool
        When true, ``weights`` is forwarded as ``class_weight``; otherwise
        ``class_weight`` is ``None`` even if ``weights`` was supplied.
    X_train, y_train : array-like
        Training inputs and targets, passed to ``model.fit`` unchanged.
    model : object
        Any object exposing a Keras-style ``fit`` method.
    weights : dict, optional
        Class-weight mapping used only when ``is_weighted`` is true.
    epochs, batch_size, validation_split, verbose : optional
        Training hyperparameters; the defaults reproduce the values that were
        previously hard-coded (50, 32, 0.1, 0).

    Returns
    -------
    Whatever ``model.fit`` returns (the training history for Keras models).
    """
    # shuffle=False keeps the samples in chronological order, so the validation
    # split is always the tail of the training period.
    return model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        shuffle=False,
        verbose=verbose,
        class_weight=weights if is_weighted else None)

# Step #8 Predict on Test data with the trained LSTM / Bidirectional LSTM Model

In [52]:
def pred(model,X_test):
    """Return ``model.predict(X_test)`` unchanged."""
    return model.predict(X_test)

In [53]:
def compute_final_df(enc,y_pred,test,start='2018'):
    """Attach decoded predictions to the evaluation window of ``test``.

    Parameters
    ----------
    enc : object
        Fitted encoder whose ``inverse_transform`` turns the rows of ``y_pred``
        back into class labels (one label per row).
    y_pred : array-like
        Model outputs, one row per predicted sample, ordered like ``test``.
    test : pandas.DataFrame
        Date-indexed test frame with a ``state`` column encoded as 0/1/2.
    start : str, default '2018'
        First index label of the evaluation window.

    Returns
    -------
    tuple
        ``(final_y_test, final_y_pred, final_pred_df)``: the true and predicted
        states of the window mapped to -1/0/1, and the window itself with the
        true ``state`` replaced by the predicted one and the index reset to a
        regular column.

    Raises
    ------
    ValueError
        If there are fewer predictions than rows in the evaluation window.

    Notes
    -----
    ``test`` is not modified. (Earlier versions remapped ``test['state']`` in
    place, which turned the labels into NaN on a second call.)
    """
    state_to_signal = {0: -1, 1: 0, 2: 1}

    # Decode once and flatten the (n, 1) label column into a 1-D vector.
    labels = np.asarray(enc.inverse_transform(y_pred)).reshape(-1)

    window = test.loc[start:, :]
    n_rows = len(window)
    if len(labels) < n_rows:
        raise ValueError(
            'got {} predictions for {} rows in the evaluation window'.format(
                len(labels), n_rows))
    # The predictions that line up with the window are the last n_rows ones.
    # Slice from an explicit start so n_rows == 0 yields an empty array.
    tail = labels[len(labels) - n_rows:]

    # drop(columns=...) instead of the positional axis form removed in pandas 2.
    final_pred_df = window.drop(columns='state')
    final_pred_df.insert(loc=final_pred_df.shape[1], column='state', value=tail)
    final_pred_df = final_pred_df.reset_index()
    final_pred_df['state'] = final_pred_df['state'].map(state_to_signal)

    final_y_pred = final_pred_df.loc[:, 'state']
    final_y_test = test.loc[start:, 'state'].map(state_to_signal)

    return final_y_test,final_y_pred,final_pred_df

In [54]:
# def compute_final_df(y_train_noenc,y_test_noenc,y_pred,test):
#     enc = onehotencoder_y(y_train_noenc,y_test_noenc)[2]
#     a = enc.inverse_transform(y_pred)
#     b = enc.inverse_transform(y_pred).shape[0]
#     y_pred2 = np.reshape(a, (1, b))[0][-978:]
#     final_pred_df = test.loc['2018':,:]
#     final_pred_df.insert(loc=test.shape[1],column='State',value=y_pred2)
#     final_pred_df = final_pred_df.drop('state', 1)
#     final_pred_df['State'] = final_pred_df['State'].map({0: -1, 1: 0,2: 1})
#     final_pred_df = final_pred_df.reset_index()
#     final_y_pred = final_pred_df.loc[:,'State']
    
#     test['state'] = test['state'].map({0: -1, 1: 0,2: 1})
#     final_y_test = test.loc['2018':,'state']
#     return final_y_test,final_y_pred,final_pred_df

# Step #9 Evaluate

In [55]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    With `normalize=True` every row is divided by its row sum before
    printing and plotting; otherwise the raw counts are shown.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        banner = "Normalized confusion matrix"
    else:
        banner = 'Confusion matrix, without normalization'
    print(banner)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Annotate every cell; text turns white on cells above half the maximum value.
    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color="white" if value > midpoint else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [56]:
def evaluate(history,X_train, y_train, y_train_noenc,y_test_noenc,X_test,y_test,y_pred,final_y_test,final_y_pred,model=None):
    """Plot the loss curves, print train/test accuracy and draw the confusion matrix.

    Parameters
    ----------
    history : object
        Training history whose ``history`` dict holds 'loss' and 'val_loss'.
    X_train, y_train, X_test, y_test : array-like
        Model inputs and one-hot targets; the first four samples of each are
        skipped before scoring.
    y_train_noenc, y_test_noenc, y_pred : unused
        Accepted for call compatibility only.
    final_y_test, final_y_pred : array-like
        True and predicted states in -1/0/1 form for the confusion matrix.
    model : optional
        Trained model exposing ``evaluate``. If omitted, a notebook-level
        variable named ``model`` is used, as the previous implementation did.

    Raises
    ------
    NameError
        If no model is passed and no global ``model`` exists.
    """
    if model is None:
        # Backward compatibility: earlier versions read a free variable `model`.
        model = globals().get('model')
        if model is None:
            raise NameError("evaluate() needs a trained model: pass model=... "
                            "or define a global named 'model'")

    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.legend()
    print('\n')

    # NOTE(review): the first 4 samples of each split are dropped before scoring;
    # the reason is not visible in this notebook -- confirm it is still wanted.
    skip = 4
    X_train = X_train[skip:]
    y_train = y_train[skip:]
    X_test = X_test[skip:]
    y_test = y_test[skip:]

    # evaluate() returns [loss, acc]; acc is a 0-1 fraction, so format it as a
    # percentage instead of appending a bare '%' to the fraction.
    scores1 = model.evaluate(X_train, y_train, verbose=0)
    print('Accuracy on training data: {:.2%} \n Error on training data: {:.2%}'.format(scores1[1], 1 - scores1[1]))

    scores2 = model.evaluate(X_test, y_test, verbose=0)
    print('Accuracy on test data: {:.2%} \n Error on test data: {:.2%}'.format(scores2[1], 1 - scores2[1]))
    print('\n')

    # Reuse the scores computed above rather than evaluating the test set again.
    print('model_evaluate:', scores2)
    print('\n')

    cnf_matrix = confusion_matrix(final_y_test, final_y_pred,labels=[-1,0,1])
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=['Bear', 'Static', 'Bull'],title='Confusion matrix')
    print('\n')

# Step #10 Compute CSV

# BackTesting

In [57]:
def operation(dataname,features_list,train_start,strategy,
              is_weighted = False,is_LSTM=True):
    """Train one classifier on one labelled dataset and backtest its predictions.

    Pipeline: read ``<dataname>.csv`` -> remap ``state`` from -1/0/1 to 0/1/2 ->
    chronological train/test split -> robust scaling -> 3-step windows ->
    one-hot labels -> fit LSTM or Bi-LSTM -> predict on test -> run the
    predicted states through a backtrader strategy.

    Parameters
    ----------
    dataname : str
        CSV file name without the '.csv' extension; the first column is used
        as a date index.
    features_list : list
        Currently unused (feature selection is disabled).
    train_start : str
        First date label of the training period.
    strategy : type
        backtrader strategy class added to Cerebro.
    is_weighted : bool, default False
        Train with balanced class weights.
    is_LSTM : bool, default True
        True builds ``LSTM_model_1``; False builds ``Bi_LSTM_model_1``.

    Returns
    -------
    tuple
        ``(pnl, sharpe)``: final minus initial portfolio value, and the
        'sharperatio' entry of the SharpeRatio analyzer.
    """
    # Read data
    df = pd.read_csv(dataname+'.csv', index_col=0)
    df.index = pd.to_datetime(df.index)
    df['state'] = df['state'].map({-1: 0, 0: 1,1: 2})

    # Copy the slices so the scaling step never writes into views of df.
    train = df.loc[train_start:'2017-12-10',:].copy()
    test = df.loc['2017-12-11':, :].copy()

    train,test = scale_transform_data(train, test)

    time_steps = 3 # Use how many days x to predict next y
    X_train, y_train_noenc = create_dataset(train, train.state, time_steps)
    X_test, y_test_noenc = create_dataset(test, test.state, time_steps)

    y_train,y_test,enc = onehotencoder_y(y_train_noenc,y_test_noenc)

    # Get weights
    weight = weights(train) if is_weighted else None

    # Both model types share the same fit / predict path; only the builder differs.
    build_model = LSTM_model_1 if is_LSTM else Bi_LSTM_model_1
    model = build_model(X_train,y_train)
    fit_LSTM(is_weighted,X_train,y_train,model,weight)
    y_pred = pred(model,X_test)

    final_pred_df = compute_final_df(enc,y_pred,test)[2]
    final_pred_df['Date'] = pd.to_datetime(final_pred_df['Date'])

    cerebro = bt.Cerebro()

    # The predicted state is the last column of the feed frame.
    state_col = final_pred_df.shape[1]-1
    data = PandasData(dataname = final_pred_df,fromdate=datetime.datetime(2018, 1, 2),todate=datetime.datetime(2021, 12, 30),datetime = 0,open = 1,high = 2,low = 3, close = 4,state = state_col)

    # Add data to Cerebro
    cerebro.adddata(data)

    # Add strategy to Cerebro
    cerebro.addstrategy(strategy)

    # Default position size
    cerebro.addsizer(bt.sizers.SizerFix, stake=1)

    # Add analytics to Cerebro (each analyzer is registered under its class name)
    analyzer_names = ('SharpeRatio', 'AnnualReturn', 'DrawDown', 'PositionsValue',
                      'LogReturnsRolling', 'PeriodStats', 'Returns',
                      'TradeAnalyzer', 'Transactions')
    for analyzer_name in analyzer_names:
        cerebro.addanalyzer(getattr(btanalyzers, analyzer_name), _name=analyzer_name)

    # Run Cerebro Engine
    start_portfolio_value = cerebro.broker.getvalue()
    thestrat = cerebro.run()[0]
    end_portfolio_value = cerebro.broker.getvalue()

    pnl = end_portfolio_value - start_portfolio_value
    sharpe = thestrat.analyzers.SharpeRatio.get_analysis()['sharperatio']
    return pnl, sharpe

In [58]:
# Experiment grid: every combination of labelled dataset, training start date,
# class-weighting flag and model type below is trained and backtested once.
# datanames = ['y1_2003','y2_2003','y3_2003','y4_2003','y5_2003','gmm_1_labeled_2003','hmm_1_labeled_2003']
datanames = ['y2_2003','y5_2003','gmm_1_labeled_2003','hmm_1_labeled_2003']
# Candidate feature columns; passed to operation(), which currently does not use them.
features_list = ['Open','High','Low','Close','momentum_rsi', 'volatility_bbp', 'volatility_dcp', 'momentum_ppo', 'momentum_tsi', 'trend_cci', 'momentum_ppo_signal', 'momentum_roc', 'trend_macd', 'trend_macd_signal', 'trend_adx_pos', 'momentum_stoch', 'trend_trix', 'trend_vortex_ind_diff', 'momentum_wr', 'trend_vortex_ind_neg', 'volatility_kcp', 'trend_adx_neg', 'trend_kst_sig', 'momentum_ao', 'volatility_ui', 'trend_kst', 'volatility_dcw', 'volatility_kcw', 'trend_vortex_ind_pos','state']
train_start_list = ['2003-01-01','2010-01-04','2015-01-02']
is_weighted_list = [False]
is_LSTM_list = [True,False]

# One dict per run; converted to a DataFrame in a later cell.
pnl_result = []
pnl = 0
for dataname in datanames:
    for train_start in train_start_list:
        for is_weighted in is_weighted_list:
            for is_LSTM in is_LSTM_list:
                # MarketStatus is not defined in this notebook; presumably it comes
                # from the star import of `strategy` -- TODO confirm.
                pnl, sharpe = operation(dataname,features_list,train_start,MarketStatus,is_weighted,is_LSTM)
                pnl_dic = {'dataname':dataname,
                            'train_start':train_start,
                            'weighted':is_weighted,
                            'LSTM':is_LSTM,
                            'pnl':pnl,
                            'sharpe':sharpe}
                pnl_result.append(pnl_dic)

# pnl_result
# data_items = pnl_result.items()
# data_list = list(data_items)
# df_pnl = pd.DataFrame(data_list)
# df_pnl.sort_values(by=1,ascending = False)
# Display the raw list of result dicts as the cell output.
pnl_result

[{'dataname': 'y2_2003',
  'train_start': '2003-01-01',
  'weighted': False,
  'LSTM': True,
  'pnl': -5.437297511816723,
  'sharpe': -84.91345266250002},
 {'dataname': 'y2_2003',
  'train_start': '2003-01-01',
  'weighted': False,
  'LSTM': False,
  'pnl': -7.149506312298399,
  'sharpe': -71.11690272672949},
 {'dataname': 'y2_2003',
  'train_start': '2010-01-04',
  'weighted': False,
  'LSTM': True,
  'pnl': 1.0324471811727562,
  'sharpe': -18.37369939298428},
 {'dataname': 'y2_2003',
  'train_start': '2010-01-04',
  'weighted': False,
  'LSTM': False,
  'pnl': 20.480597100424347,
  'sharpe': -7.20750741278443},
 {'dataname': 'y2_2003',
  'train_start': '2015-01-02',
  'weighted': False,
  'LSTM': True,
  'pnl': 55.68931353087282,
  'sharpe': -2.7436712415978772},
 {'dataname': 'y2_2003',
  'train_start': '2015-01-02',
  'weighted': False,
  'LSTM': False,
  'pnl': 83.79190953961006,
  'sharpe': -1.5142058531636218},
 {'dataname': 'y5_2003',
  'train_start': '2003-01-01',
  'weighted'

In [59]:
# Collect one row per experiment and save the table ranked by PnL (best first).
pnl_result_df = pd.DataFrame(pnl_result)
# sort_values returns a new frame, so the export must be chained onto the sorted
# result; pnl_result_df itself keeps its run-order rows for the cells below.
pnl_result_df.sort_values(by='pnl',ascending=False).to_excel('0 nn_result.xlsx')

In [60]:
# Display all runs ranked by PnL, best first (pnl_result_df itself is not reordered).
pnl_result_df.sort_values(by='pnl',ascending=False)

Unnamed: 0,dataname,train_start,weighted,LSTM,pnl,sharpe
14,gmm_1_labeled_2003,2010-01-04,False,True,310.634065,-0.342248
17,gmm_1_labeled_2003,2015-01-02,False,False,171.749713,-0.768692
16,gmm_1_labeled_2003,2015-01-02,False,True,171.018954,-0.82769
23,hmm_1_labeled_2003,2015-01-02,False,False,144.136153,-0.805147
5,y2_2003,2015-01-02,False,False,83.79191,-1.514206
22,hmm_1_labeled_2003,2015-01-02,False,True,69.389808,-0.557063
18,hmm_1_labeled_2003,2003-01-01,False,True,63.641477,-3.796594
4,y2_2003,2015-01-02,False,True,55.689314,-2.743671
15,gmm_1_labeled_2003,2010-01-04,False,False,37.605356,-1.87536
12,gmm_1_labeled_2003,2003-01-01,False,True,37.111888,-2.764289


In [61]:
# Ten best runs by PnL.
pnl_result_df.sort_values(by='pnl', ascending=False).head(10)

Unnamed: 0,dataname,train_start,weighted,LSTM,pnl,sharpe
14,gmm_1_labeled_2003,2010-01-04,False,True,310.634065,-0.342248
17,gmm_1_labeled_2003,2015-01-02,False,False,171.749713,-0.768692
16,gmm_1_labeled_2003,2015-01-02,False,True,171.018954,-0.82769
23,hmm_1_labeled_2003,2015-01-02,False,False,144.136153,-0.805147
5,y2_2003,2015-01-02,False,False,83.79191,-1.514206
22,hmm_1_labeled_2003,2015-01-02,False,True,69.389808,-0.557063
18,hmm_1_labeled_2003,2003-01-01,False,True,63.641477,-3.796594
4,y2_2003,2015-01-02,False,True,55.689314,-2.743671
15,gmm_1_labeled_2003,2010-01-04,False,False,37.605356,-1.87536
12,gmm_1_labeled_2003,2003-01-01,False,True,37.111888,-2.764289
