### Modeling High-Frequency Limit Order Book Dynamics Using Machine Learning 

In [69]:
from statistics import mean

from matplotlib import pyplot as plt
from matplotlib.pyplot import plot
from numpy import std
from sklearn.model_selection import GridSearchCV
import seaborn as sns
%pylab inline
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,\
                              GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [70]:
def read_csv(day_trade):
    data_up = []
    data_down = []
    path = "/Users/zengyan/Excelsior/SGX-Full-OrderBook-Tick-Data-Trading-Strategy/Data_Transformation/Train_Test_Builder/order_book_3_2014"
    # path = '/home/rory/Demo/Data_Transformation/Train_Test_Builder/order_book_3_2014'
    for j,i in enumerate(day_trade):
        for k in range(0,len(i),1):
            path_up = path + '_' + str(j+1) + '_' + str(i[k]) + '_' + 'UP' + '.csv'
            path_down = path + '_' + str(j+1) + '_' + str(i[k]) + '_' + 'DOWN' + '.csv'
            data_up.append(pd.read_csv(path_up))
            data_down.append(pd.read_csv(path_down))
            #print path_down
    return data_up,data_down

### Example : 2014/1/2 

In [71]:
day_trade = [[2]]

In [72]:
data_2014_up, data_2014_down = read_csv(day_trade)

### Column = 0 : label[0 : not traded,1 : traded] & Column = 1~ : Features values 

In [73]:
data_2014_up[0].head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.897292,-0.054134,0.794007,-0.114823,0.967576,-0.016479,0.913658,-0.045119
1,0.0,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,...,1.0,0.0,0.897292,-0.054134,0.794007,-0.114823,0.967576,-0.016479,0.913658,-0.045119
2,0.0,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,-0.14065,...,1.0,0.0,0.897292,-0.054134,0.794007,-0.114823,0.967576,-0.016479,0.913658,-0.045119
3,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,1.18871,0.08622,1.199119,0.090545,1.21165,0.095698,1.254026,0.112699,1.246215,0.109613
4,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,1.172581,0.079436,1.172687,0.079481,1.172816,0.079535,1.246706,0.109808,1.230279,0.103251
5,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,0.930645,-0.035923,0.90837,-0.048015,0.881553,-0.062951,1.027086,0.013362,0.991235,-0.004402
6,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,1.017742,0.008793,1.051101,0.024914,1.091262,0.04364,1.066618,0.032235,1.077291,0.037208
7,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,1.033871,0.016653,1.077533,0.03732,1.130097,0.061076,1.073939,0.035651,1.093227,0.044537
8,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,1.03871,0.018987,1.085463,0.04098,1.141748,0.066183,1.076135,0.036671,1.098008,0.046715
9,0.0,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,0.21097,...,0.946269,-0.027607,0.935409,-0.033374,0.923577,-0.03973,1.031073,0.015299,1.002214,0.001106


### Machine learning algorithms

In [74]:
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state = 0),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state = 0),
    # 'AdaBoostClassifier': AdaBoostClassifier(base_estimator = DecisionTreeClassifier(),n_estimators = 10,random_state = 0)
    # 'GradientBoostingClassifier': GradientBoostingClassifier(random_state = 0),
    # 'SVC': SVC(probability=True,random_state = 0),
}

### Grids for Hyperparameter Tuning

In [75]:
model_grid_params = {
    'RandomForestClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
                               'min_samples_split':[2],'criterion':['entropy'],\
                               'min_samples_leaf':[3]},
    'ExtraTreesClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
                             'min_samples_split':[2],'criterion':['entropy'],\
                             'min_samples_leaf':[3]},
    'AdaBoostClassifier': {"base_estimator__criterion" : ["entropy"],\
                           "base_estimator__max_depth": [None],\
                           "base_estimator__min_samples_leaf" : [3],\
                           "base_estimator__min_samples_split" : [2],\
                           "base_estimator__max_features" : [None]},
    'GradientBoostingClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
                                   'min_samples_split':[2],'min_samples_leaf':[3],\
                                   'learning_rate':[0.1],'subsample':[1.0]},
    'SVC': [{'kernel':['rbf'],'gamma':[1e-1],'C':[1]},\
            {'kernel':['linear'],'C':[1,10]}]
}

### Model Selection Pipline

In [76]:
class Model_Selection:
    
    def __init__(self,models,model_grid_params,data_2014,latest_sec,pred_sec,day):
        
        self.models = models
        self.model_grid = model_grid_params
        self.data_2014 = data_2014
        self.latest_sec = latest_sec
        self.pred_sec = pred_sec
        self.day = day
        self.keys = models.keys()
        self.best_score = {}
        self.grid = {}
        self.predict_values = {}
        self.cv_acc = {}
        self.acc = {}
        self.fscore = {}
        self.true_values = {}
        self.predict_values_day = {}
        self.cv_acc_day = {}
        self.acc_day = {}
        self.fscore_day = {}
        self.true_values_day = {}
        self.summary_day = []
        
    def Grid_fit(self,X_train,y_train,cv = 5,scoring = 'accuracy'):
        
        for key in self.keys:
            print("Running GridSearchCV for %s." %(key))
            model = self.models[key]
            model_grid = self.model_grid[key]
            Grid = GridSearchCV(model, model_grid, cv = cv, scoring = scoring)
            Grid.fit(X_train,y_train) 
            self.grid[key] = Grid
            print(Grid.best_params_)
            print('CV Best Score = %s'%(Grid.best_score_))
            self.cv_acc[key].append(Grid.best_score_)  
    
    def model_fit(self,X_train, y_train, X_test, y_test):
        
        for key in self.keys:
            print("Running training & testing for %s." %(key))
            model = self.models[key]
            model.set_params(**self.grid[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            #print 'Prediction latest 15 second = %s'%(predictions)
            self.predict_values[key].append(predictions.tolist())
            self.true_values[key].append(y_test.tolist())
            acc = metrics.accuracy_score(y_test,predictions)
            f_score = metrics.f1_score(y_test,predictions)
            print('Accuracy = %s'%(acc))
            self.acc[key].append(acc)
            self.fscore[key].append(f_score)
            
            if key == 'SVC':
                if self.grid[key].best_params_.values()[0] == 'linear':
                    feature_imp = dict(zip([i for i in range(0,64,1)],model.coef_[0]))
                    Top_five = sorted(feature_imp.items(),key = lambda x : x[1] , reverse=True)[0:5]
                    #print 'Kernel is linear and top five importance features = %s'%(Top_five)
                else:
                    #print 'Kernel is rbf'
                    pass
            else: 
                feature_imp = dict(zip([i for i in range(0,64,1)],model.feature_importances_))
                Top_five = sorted(feature_imp.items(),key = lambda x : x[1] , reverse=True)[0:5]
                #print 'Top five importance features = %s'%(Top_five)
                pass

    def pipline(self):
        
        self.set_list_day() # store day values
        for day in range(0,self.day,1):
            self.set_list() # store values
            print('Day = %s'%(day+1))
            for i in range(0,10,self.pred_sec):#9000-self.latest_sec-600,self.pred_sec):
                
                print('--------------------Rolling Window Time = %s--------------------'%(i/pred_sec))
                # Train data
                data_train = self.data_2014[day][i:i+self.latest_sec]
                X_train = data_train.drop(['0'],axis=1)#,'65','66','67'],axis=1)
                y_train = data_train['0']

                # Test data
                data_test = self.data_2014[day][i + self.latest_sec:i + self.latest_sec + self.pred_sec]
                X_test = data_test.drop(['0'],axis=1)#,'65','66','67'],axis=1)
                y_test = data_test['0']
                
                #start = time.time()
                self.Grid_fit(X_train, y_train, cv = 5, scoring = 'accuracy')
                self.model_fit(X_train, y_train,X_test,y_test)
                end = time.time()
                print('Total Time = %s'%(end - start))
                
            for key in self.keys:
                
                self.cv_acc_day[key].append(self.cv_acc[key])
                self.acc_day[key].append(self.acc[key])
                self.fscore_day[key].append(self.fscore[key])
                self.true_values_day[key].append(self.true_values[key])
                self.predict_values_day[key].append(self.predict_values[key])
            
            self.summary_day.append(self.score_summary(sort_by = 'Accuracy_mean'))
    
    def set_list(self):
        
        for key in self.keys:
            self.predict_values[key] = []
            self.cv_acc[key] = []
            self.acc[key] = []
            self.fscore[key] = []
            self.true_values[key] = []
            
    def set_list_day(self):
        
        for key in self.keys:
            self.predict_values_day[key] = []
            self.cv_acc_day[key] = []
            self.acc_day[key] = []
            self.fscore_day[key] = []
            self.true_values_day[key] = []
            
    def score_summary(self,sort_by):
        
        summary = pd.concat([pd.DataFrame(self.acc.keys()),pd.DataFrame(map(lambda x: mean(self.acc[x]), self.acc)),\
                             pd.DataFrame(map(lambda x: std(self.acc[x]), self.acc)),\
                             pd.DataFrame(map(lambda x: max(self.acc[x]), self.acc)),\
                             pd.DataFrame(map(lambda x: min(self.acc[x]), self.acc)),\
                             pd.DataFrame(map(lambda x: mean(self.fscore[x]), self.fscore))],axis=1)
        summary.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        summary.index.rename('Ranking', inplace=True)
        return summary.sort_values(by = [sort_by], ascending=False)
          
    def print_(self):
        print(self.predict_values)


In [77]:
latest_sec = 60 * 30
pred_sec = 10
day = 1
data_2014_up, data_2014_down = read_csv(day_trade)
data_2014 = data_2014_up
pip = Model_Selection(models,model_grid_params,data_2014,latest_sec,pred_sec,day)

### Start Machine Learning Pipline

In [78]:
start = time.time()
pip.pipline()
end = time.time()
print('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for RandomForestClassifier.
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.6805555555555556
Running GridSearchCV for ExtraTreesClassifier.
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.7305555555555556
Running training & testing for RandomForestClassifier.
Accuracy = 0.9
Running training & testing for ExtraTreesClassifier.
Accuracy = 1.0
Total Time = 1.0086650848388672
Total Time = 1.0099620819091797


### Metrics 

In [79]:
pip.summary_day#.reset_index(drop = True)

[                      Estimator  Accuracy_mean  Accuracy_std  Accuracy_max  \
 Ranking                                                                      
 1          ExtraTreesClassifier            1.0           0.0           1.0   
 0        RandomForestClassifier            0.9           0.0           0.9   
 
          Accuracy_min   F_score  
 Ranking                          
 1                 1.0  1.000000  
 0                 0.9  0.947368  ]

In [80]:
# pip.summary_day[1]#.reset_index(drop = True)

In [81]:
# pip.summary_day[2]#.reset_index(drop = True)

In [82]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
color_ = ['r','b']
plot(data_2014[1]['66'],label = 'Best Ask',color = color_[1])
plot(data_2014[1]['67'],label = 'Best Bid',color = color_[0])
plt.legend(loc=0)
plt.xlabel('Time(s)',size = 15)
plt.ylabel('Price',size = 15)

IndexError: list index out of range

<Figure size 1800x600 with 0 Axes>

### Accuracy in one day

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
color = []
for key in pip.keys:
    plot(np.array(pip.acc_day[key])[0],'-o',label = key,lw = 1,markersize = 3)
    plt.legend(loc=0)
plt.ylim(-0.5,1.5)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('Accuracy',size = 15)

### Cross Validation 

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
color_ = ['r','orange','y','g','b']
for index,key in enumerate(pip.keys):
    plot(np.array(pip.cv_acc_day[key])[0],'-o',label = key,color = color_[index],lw = 1,markersize = 3)
#plot(best_cv_score,'-v',label = 'Best cv 5 folds score',color = 'violet',lw = 1,markersize = 6)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('CV Mean Accuracy',size = 15)

### Best Model

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
plot(best_cv_score,'-o',label = 'Best cv 5 folds score',color = 'violet',lw = 1,markersize = 5)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('CV Mean Accuracy',size = 15)

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
color_ = ['r','orange','y','g','b']
for index,key in enumerate(pip.keys):
    plot(np.array(pip.cv_acc_day[key])[0][0:250],'-o',label = key,color = color_[index],lw = 1,markersize = 5)
#plot(best_cv_score,'-v',label = 'Best cv 5 folds score',color = 'violet',lw = 1,markersize = 6)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('CV Mean Accuracy',size = 15)
plt.ylim(0.55,1)

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (18,6))
plot(best_cv_score[0:250],'-o',label = 'Best cv 5 folds score',color = 'violet',lw = 1,markersize = 5)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('CV Mean Accuracy',size = 15)
plt.ylim(0.55,1)

### Profit & Loss

In [None]:
# compute cum_profit and Best_cv_score
dict_ = {}
dict_['cum_profit'] = []
dict_['Best_cv_score'] = []

for day in range(0,1,1):
    cum_profit_label = []
    cum_profit = []
    best_cv_score = []
    spread = 0.2 * data_2014[day]['65'][1800:][9::10].values
    loss = 0.2*(data_2014[0]['67'][1800:9000-600][9::10].values - data_2014[day]['67'][1800+600:9000][9::10].values)
    for j in range(0,len(pip.cv_acc_day.values()[0][day]),1):
        max_al = {}
        for i in range(0,len(pip.keys),1):
            max_al[pip.keys[i]] = np.array(pip.cv_acc_day[pip.keys[i]])[day][j]
        # select best algorithm in cv = 5    
        top_cv_acc = sorted(max_al.items(),key = lambda x : x[1], reverse = True)[0:1][0]
        best_cv_score.append(top_cv_acc[1])
        submission = pip.predict_values_day[top_cv_acc[0]][day][j][-1]
        true_value = pip.true_values_day[top_cv_acc[0]][day][j][-1]

        if submission == true_value:
            if submission == 1:
                cum_profit_label.append(1)
                cum_profit.append(spread[j])
            elif submission == 0:
                cum_profit_label.append(0)
                cum_profit.append(0)
        elif submission != true_value:
            if submission == 1:
                cum_profit_label.append(-1)
                cum_profit.append(loss[j])
            elif submission == 0:
                cum_profit_label.append(0)
                cum_profit.append(0)
                
    dict_['cum_profit'].append(cum_profit)
    dict_['Best_cv_score'].append(best_cv_score)

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize = (20,8))
plt.subplot(211)
plot(cum_profit,'-o',label = 'Profit & Loss',lw = 1,markersize = 3)
plt.ylabel('Tick',size = 15)
plt.legend(loc=0)
plt.ylim(-7.5,2.5)
plt.subplot(212)
plot(cumsum(cum_profit),'-o',label = 'Cum Profit',lw = 1,markersize = 2)
plt.legend(loc=0)
plt.xlabel('Rolling Window Numbers',size = 15)
plt.ylabel('Profit',size = 15)