<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>



In [3]:

%%javascript
// Fetches a third-party table-of-contents script from a remote URL and runs it in the
// notebook page, so this cell needs network access every time it is executed.
// NOTE(review): relies on a global jQuery `$` being available in the notebook front end —
// confirm this holds in the environment the notebook is run in.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')


<IPython.core.display.Javascript object>

Set up the environment, load the exported file stored on Google Drive, and read it into the variable **prices_raw**.


In [0]:
import pandas as pd
import numpy as np
# Read the exported spreadsheet into a DataFrame; the path is relative to the working directory.
# NOTE(review): the extension is spelled 'xslx' rather than 'xlsx' — confirm it matches the file on disk.
prices_raw = pd.read_excel('exported.xslx')

In [0]:
# (rows, columns) of the raw export, as a quick sanity check on the load.
prices_raw.shape


## Preprocessing & exploration

In [0]:
# Convert every timestamp-like column of the raw export to pandas datetimes.
timestamp_columns = (
    'date',
    'ebsMarketUpdateTime',
    'feedHandlerPublishTime',
    'feedHandlerReceiveTime',
    'eventCaptureTime',
)
for column_name in timestamp_columns:
    prices_raw[column_name] = pd.to_datetime(prices_raw[column_name])

print("First three rows of raw data")
prices_raw.head(3)

In [0]:
# Derive calendar / time-of-day features from the feed-handler receive timestamp.
# The vectorised `.dt` accessor replaces the per-row Python lambdas: it produces the
# same values without a Python-level call per row, and it tolerates missing
# timestamps (NaT), which the original `x.timetuple()` call does not.
receive_time = prices_raw["feedHandlerReceiveTime"]
prices_raw["DayOfMonth"] = receive_time.dt.day
prices_raw["Hour"] = receive_time.dt.hour
prices_raw["Minutes"] = receive_time.dt.minute
prices_raw["Seconds"] = receive_time.dt.second
# Monday=0 ... Sunday=6, the same convention as datetime.weekday().
prices_raw["WeekDay"] = receive_time.dt.weekday
# 1-based day of the year, the same value as timetuple().tm_yday.
prices_raw["DayOfYear"] = receive_time.dt.dayofyear
# Gap to the previous row's receive time (a timedelta; missing on the first row).
prices_raw["time_diff"] = receive_time - receive_time.shift(1)

In [0]:
# Preview the first five rows, now including the derived time columns.
prices_raw.head()

In [0]:
# Helper used element-wise to turn timedelta values into plain seconds.
def convert_time_to_number(time_delta):
    """Return the duration of ``time_delta`` as a float number of seconds.

    ``time_delta`` may be any object exposing ``total_seconds()``, such as
    ``datetime.timedelta`` or ``pandas.Timedelta``.
    """
    elapsed_seconds = time_delta.total_seconds()
    return elapsed_seconds

In [0]:
# Consecutive time difference, in seconds, between each row and the previous one
# (NaN on the first row, which has no predecessor).
# Recomputed directly from the receive timestamps so the cell can be re-run safely:
# the previous version converted "time_diff" in place and failed on a second run,
# because by then the column held floats rather than timedeltas.
prices_raw["time_diff"] = prices_raw["feedHandlerReceiveTime"].diff().dt.total_seconds()

In [0]:
# Set the first value of the column to zero (there is no previous row to diff against).
# A single positional .iloc write replaces the chained `prices_raw["time_diff"][0] = 0.0`,
# which assigns through an intermediate Series (SettingWithCopyWarning, and no effect
# under pandas copy-on-write) and addresses the *label* 0 rather than the first row.
prices_raw.iloc[0, prices_raw.columns.get_loc("time_diff")] = 0.0
# Elapsed time: running total of the gaps, i.e. seconds since the first row.
prices_raw["elapsed_time"] = prices_raw["time_diff"].cumsum()
# Square of the elapsed time, kept as an additional candidate feature.
prices_raw["elapsed_time_sq"] = prices_raw["elapsed_time"] ** 2
prices_raw.head()

In [0]:
# Working frame: the time features plus the book / trade columns used downstream.
# .copy() makes `prices` independent of `prices_raw`, so the column assignments below
# write to a real frame instead of a slice (SettingWithCopyWarning / lost writes).
selected_columns = ['DayOfMonth', 'Hour', 'WeekDay', 'Minutes', 'Seconds',
                    'time_diff', 'elapsed_time', 'elapsed_time_sq',
                    'date', 'bid', 'ask', 'bid2', 'ask2',
                    'bidSize1', 'askSize1', 'bidSize2', 'askSize2', 'paid', 'given']
prices = prices_raw[selected_columns].copy()

# Zeros in the price columns are treated as missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
zero_as_missing = ['bid', 'ask', 'bid2', 'ask2', 'paid', 'given']
prices[zero_as_missing] = prices[zero_as_missing].replace(0, np.nan)

# Mid price: average of best bid and best ask (NaN when either side is missing).
prices['mid'] = 0.5 * (prices['bid'] + prices['ask'])

# Index the working frame by the feed-handler receive time.
prices.index = prices_raw.feedHandlerReceiveTime

In [0]:
# Preview the working frame, now indexed by receive time.
prices.head()

In [0]:
# Remove rows whose column values all repeat an earlier row, keeping the first
# occurrence (the index is not part of the comparison).
prices = prices.drop_duplicates(keep='first')

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling and inline rendering for the rest of the notebook.
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline

# Turns off pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): this also hides genuine chained-assignment problems; prefer explicit
# .copy() / .loc at the point of assignment.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 100

# NOTE(review): from_date, to_date, start_time, end_time and site are not referenced
# anywhere else in the visible notebook — confirm whether they are leftovers.
from_date = '2017.09.15'
to_date = '2017.09.30'
start_time = '03:00'
end_time = '15:00'

# `sym` is only used in the plot title below.
sym = '`USDINR'
site = "`LOH"

# `date` is only used in the plot title; it does not filter the plotted rows.
date = '2017.09.15'
prices['date'] = pd.to_datetime(prices.date)
# Mid price plotted against the frame's index.
prices[['mid']].plot(figsize=(15,5), title=sym+ ' mid timeserie '+date)
plt.show()

In [0]:
# Row-over-row change of the book columns (first row is NaN: nothing precedes it).
columns = ['bid','ask','bid2','ask2','bidSize1','askSize1','bidSize2','askSize2','mid']
book_levels = prices[columns]
delta_names = {
    'mid': 'deltaMid',
    'bid': 'deltaBid',
    'ask': 'deltaAsk',
    'bidSize1': 'deltaBidSize1',
    'askSize1': 'deltaAskSize1',
    'bidSize2': 'deltaBidSize2',
    'askSize2': 'deltaAskSize2',
}
# NOTE(review): 'bid2' and 'ask2' are not in the rename map and are not restored below,
# so those two columns of prices_delta hold differences under level-style names —
# confirm this is intended before using them as features.
prices_delta = (book_levels - book_levels.shift(1)).rename(columns=delta_names)

# add back old prices, and a midDiff for learning later
LL = ['mid','bid','ask','bidSize1','bidSize2','askSize1','askSize2']
for level_name in LL:
    prices_delta[level_name] = prices[level_name]
# Counter that increases on every row whose mid delta is not zero; a NaN delta also
# counts, because NaN != 0 evaluates to True.
mid_changed = prices_delta['deltaMid'].ne(0)
prices_delta['midDiffInterval'] = mid_changed.cumsum()

In [0]:
# Carry the calendar and elapsed-time columns over from `prices`, one column at a time.
add_col = ['DayOfMonth','Hour','WeekDay','Minutes','Seconds','time_diff','elapsed_time', 'elapsed_time_sq']
for time_column in add_col:
    prices_delta[time_column] = prices[time_column]

In [0]:
# Preview the delta frame after the level and time columns have been added back.
prices_delta.head()

In [0]:
# time features (on feedHandlerReceive): the date column and the receive timestamp
prices_delta['date'] = prices.date
prices_delta['time'] = prices.index

# trade features: print, tradeSeq, lastPaid, lastGiven and the distance of
# mid / bid / ask to the last paid / given price.
# NOTE(review): the threshold 1 presumably separates real trade prices from
# placeholder values — confirm against the data dictionary.
has_trade = (prices['paid'] > 1) | (prices['given'] > 1)
# .copy() so the masking below writes to an independent frame, not a slice of `prices`.
atomicTrades = prices.loc[has_trade, ['paid', 'given']].copy()
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
atomicTrades.loc[atomicTrades['paid'] < 1, 'paid'] = np.nan
atomicTrades.loc[atomicTrades['given'] < 1, 'given'] = np.nan
atomicTrades = atomicTrades.replace(0, np.nan)
# Index-aligned assignment: rows without a trade receive NaN.
# NOTE(review): alignment assumes the receive-time index identifies rows uniquely — confirm.
prices_delta['paid'] = atomicTrades['paid']
prices_delta['given'] = atomicTrades['given']
# 1 on rows carrying a trade on either side, 0 otherwise.
prices_delta['print'] = np.where((prices_delta['paid'] > 1) | (prices_delta['given'] > 1), 1, 0)
# Running count of trade rows seen so far.
prices_delta['tradeSeq'] = prices_delta['print'].cumsum()
# Most recent paid / given price carried forward.
prices_delta['lastPaid'] = prices_delta['paid'].ffill()
prices_delta['lastGiven'] = prices_delta['given'].ffill()
# drop(columns=...) replaces `drop('paid', 1, inplace=True)`: the positional axis
# argument is rejected by pandas 2.0 and later.
prices_delta = prices_delta.drop(columns=['paid', 'given'])
for quote in ('mid', 'bid', 'ask'):
    prices_delta[quote + 'ToPaid'] = prices_delta[quote] - prices_delta['lastPaid']
    prices_delta[quote + 'ToGiven'] = prices_delta[quote] - prices_delta['lastGiven']

# book pressure feature: mid minus the size-weighted average of best bid and best ask.
# Computed once and stored on both frames (the original repeated the expression).
size_weighted_price = (
    (prices['bidSize1'] * prices['bid'] + prices['askSize1'] * prices['ask'])
    / (prices['bidSize1'] + prices['askSize1'])
)
book_pressure = prices['mid'] - size_weighted_price
prices['book_pressure'] = book_pressure
prices_delta['book_pressure'] = book_pressure

# spread feature
prices_delta['spread'] = prices_delta['ask'] - prices_delta['bid']

# create feature to learn, ie next move (not to be used as covariates!)
prices_delta['midDiff'] = prices_delta['mid'].diff()
prices_delta['nextMidDiff'] = prices_delta['midDiff'].shift(-1)
# Zero moves are replaced by the next entry that is not zero.
# NOTE(review): the `method=` argument of Series.replace is deprecated in recent
# pandas releases; left unchanged here because a drop-in equivalent must reproduce
# its exact treatment of NaN entries.
prices_delta['nextMidVariation'] = prices_delta['nextMidDiff'].replace(to_replace=0, method='bfill')

# Keep only rows where every column is populated.
prices_delta = prices_delta.dropna()

In [0]:
# Preview the engineered features after rows with missing values were dropped.
prices_delta.head()

In [0]:
from scipy import stats
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
# Each replace pair turns zeros into NaN and then every NaN back into 0, so its net
# effect on the values is that any NaN ends up as 0.
prices_delta = prices_delta.replace(0, np.nan)
prices_delta = prices_delta.replace(np.nan, 0)
# Outlier filter: keep rows whose mid-price change is within 5 standard deviations
# (absolute z-score below 5).
within_five_sigma = np.abs(stats.zscore(prices_delta['deltaMid'])) < 5
prices_delta_clean = prices_delta[within_five_sigma]
prices_delta_clean = prices_delta_clean.replace(0, np.nan)
prices_delta_clean = prices_delta_clean.replace(np.nan, 0)

In [0]:
# Per-column count of missing values.
print(prices_delta_clean.isnull().sum())

In [0]:
# Preview the outlier-filtered frame that feeds the models.
prices_delta_clean.head()

## Features

Select the features that are going to be included in the model

In [0]:
# Model inputs. 'Minutes', 'Seconds', 'time_diff', 'elapsed_time' and 'elapsed_time_sq'
# exist in prices_delta_clean but are not included in this list.
features = [
    'deltaBid', 'deltaAsk', 'deltaMid', 'midToPaid', 'midToGiven',
    'bidSize1', 'askSize1', 'bidToPaid', 'askToGiven', 'bidToGiven',
    'askToPaid', 'book_pressure', 'spread', 'bid2', 'ask2', 'DayOfMonth',
    'Hour', 'WeekDay', 'bidSize2', 'askSize2', 'deltaBidSize2', 'deltaAskSize2',
]

# Rows dated on any of these three days form the test set; all other rows are training data.
# NOTE(review): the 'date' column is compared against dotted date strings — confirm the
# comparison matches rows as intended for that column's dtype.
holdout_days = ['2017.09.29', '2017.09.28', '2017.09.27']
OUT = prices_delta_clean.date == holdout_days[0]
for holdout_day in holdout_days[1:]:
    OUT = OUT | (prices_delta_clean.date == holdout_day)
IN = ~OUT

train_rows = prices_delta_clean[IN]
test_rows = prices_delta_clean[OUT]
X_train = np.array(train_rows[features].values)
y_train = np.array(train_rows['nextMidVariation'].values)
X_test = np.array(test_rows[features].values)
y_test = np.array(test_rows['nextMidVariation'].values)

# Binarise the targets in place: negative moves become 0, positive moves become 1.
# Entries that are exactly 0 are left untouched, so they share class 0 with down moves.
for target in (y_train, y_test):
    target[target < 0] = 0
    target[target > 0] = 1


## LSTM

In [0]:
# Insert a length-1 axis in the middle so each sample becomes (1, n_features),
# giving tensors of shape (n_samples, 1, n_features).
X_train_lstm = np.expand_dims(X_train, axis=1)
X_test_lstm = np.expand_dims(X_test, axis=1)
# The targets are reused as they are (same array objects, no copy).
y_train_lstm, y_test_lstm = y_train, y_test
print(X_train_lstm.shape, X_test_lstm.shape)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import BatchNormalization
from keras.layers import Activation
from keras import optimizers

In [0]:
# Network: one LSTM layer, two Dense + Dropout stages, and a single sigmoid output unit.
model = Sequential()
# input_shape is (timesteps, features), read from the reshaped training tensor.
model.add(LSTM(42,recurrent_dropout=0.2, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
# NOTE(review): no activation is passed to the two Dense(42) layers, so the Keras
# default applies — confirm that a non-linearity was not intended here.
model.add(Dense(42))
model.add(Dropout(0.1))


model.add(Dense(42))
model.add(Dropout(0.1))

model.add(Dense(1, activation='sigmoid'))
# For a binary classification problem
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

# fit network
# shuffle=False keeps the samples in their original order; verbose=0 suppresses the progress output.
# NOTE(review): no random seed is set in the visible notebook, so results can differ between runs.
history = model.fit(X_train_lstm, y_train_lstm, epochs=20, batch_size=10000, verbose=0, shuffle=False)


In [0]:
from matplotlib import pyplot
model.summary()
# Training loss per epoch.
pyplot.plot(history.history['loss'], label='train')

pyplot.legend()
pyplot.show()

# Predicted probabilities, and hard 0/1 labels obtained by thresholding them at 0.5.
# `Sequential.predict_classes` no longer exists in current Keras; for a single sigmoid
# output it performed exactly this thresholding, so the labels are unchanged.
test_predictions_lstm2 = model.predict(X_test_lstm)
test_predictions_lstm = (test_predictions_lstm2 > 0.5).astype("int32")

# evaluate() returns [loss, accuracy] for the metrics this model was compiled with.
print('train:', model.evaluate(x=X_train_lstm, y=y_train_lstm, verbose=1))
# The test set is evaluated once and the result reused for the summary line below.
scores = model.evaluate(x=X_test_lstm, y=y_test_lstm, verbose=1)
print('test:', scores)

print("Accuracy: %.2f%%" % (scores[1]*100))

## Different classification models for comparison 

In [0]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# extreme gradient boosting parameters
# NOTE(review): only xgb_params0 is used below; xgb_params1-3 are kept as alternative
# configurations. 'silent', 'seed' and 'nthread' are legacy XGBoost parameter names —
# confirm they are still honoured by the installed version.
xgb_params0 = {'colsample_bytree': 1.0, 'silent': 1, 'min_child_weight': 10,
               'n_estimators': 300, 'subsample': 1, 'learning_rate': 0.001,
               'objective': 'binary:logistic', 'seed': 10, 'max_depth': 7, 'gamma': 0.0}
xgb_params1 = {'colsample_bytree': 0.77, 'silent': 1, 'nthread': 8, 'min_child_weight': 15,
               'n_estimators': 50, 'subsample': 0.77, 'learning_rate': 0.09,
               'objective': 'binary:logistic', 'seed': 11, 'max_depth': 6, 'gamma': 0.2}
xgb_params2 = {'colsample_bytree': 1, 'silent': 1, 'nthread': 8, 'min_child_weight': 10,
               'n_estimators': 50, 'subsample': 1, 'learning_rate': 0.095,
               'objective': 'binary:logistic', 'seed': 10, 'max_depth': 7, 'gamma': 0.}
xgb_params3 = {'colsample_bytree': 1, 'silent': 1, 'nthread': 8, 'min_child_weight': 10,
               'n_estimators': 30, 'subsample': 0.95, 'learning_rate': 0.07,
               'objective': 'binary:logistic', 'seed': 10, 'max_depth': 7, 'gamma': 0.}

# extreme gradient boosting model
# NOTE: this rebinds `model`, which previously referred to the LSTM network.
model = XGBClassifier(**xgb_params0)

# logistic regression model with L1 penalty
# solver='liblinear' is named explicitly: it supports the L1 penalty and was the
# default solver in older scikit-learn. Since 0.22 the default is 'lbfgs', which
# rejects penalty='l1' with a ValueError.
model_logit = LogisticRegression(C=10, penalty='l1', tol=0.001, solver='liblinear')

# SVC model with RBF kernel
# model_SVC = SVC(kernel='rbf', C=1.0, decision_function_shape='ovr')

# NOTE(review): no random_state is set, so the forest differs between runs.
randomforest = RandomForestClassifier(max_depth=6, n_estimators=1000)

# Fit the algorithms on the training split
model.fit(X_train, y_train)
model_logit.fit(X_train, y_train)
# model_SVC.fit(X_train,y_train)
randomforest.fit(X_train, y_train)


In [0]:
%matplotlib inline

# feature_name -> feature_importance, as reported by the fitted `model`
feats = dict(zip(prices_delta_clean[features].columns, model.feature_importances_))

# Tabulate the importances and plot them from largest to smallest.
# NOTE(review): the column is labelled 'Gini-importance' — confirm that this matches
# the importance type the fitted `model` actually reports.
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
ranked_importances = importances.sort_values(by='Gini-importance', ascending=False)
ranked_importances.plot(kind='bar', rot=45)

In [0]:
# Class predictions on the test set from each fitted classifier, in the order
# `model`, `model_logit`, `randomforest`.
predictions, predictions_logit, predictions_randomforest = [
    classifier.predict(X_test)
    for classifier in (model, model_logit, randomforest)
]

In [0]:
# print accuracy and classification report
from sklearn import metrics
# Accuracy followed by the per-class report for each model's predictions, in the
# order `predictions`, `predictions_logit`, `predictions_randomforest`.
for model_predictions in (predictions, predictions_logit, predictions_randomforest):
    print(metrics.accuracy_score(y_test, model_predictions))
    print(metrics.classification_report(y_test, model_predictions))



## Test accuracies of different models 

Logistic Regression model : 0.49

Support vector machine with RBF kernel : 0.56

Logistic Regression with L1 penalty (C = 10, tol = 0.001) : 0.58

Extreme Gradient Boosting Trees : 0.59

LSTM : 0.55

random forest : 0.59

### Combining predictions

In [0]:
# Flatten the LSTM labels to one dimension so they fit as a DataFrame column.
# np.ravel replaces np.concatenate over the individual rows: same values and order,
# without building one tiny array per sample, and it also accepts input that is
# already one-dimensional.
test_predictions_lstm3 = np.ravel(test_predictions_lstm)

# One column of class predictions per base model, one row per test sample.
allpredictions = pd.DataFrame(
    {
        'lstm': test_predictions_lstm3,
        'xgb': predictions,
        'logit': predictions_logit,
        'randomforest': predictions_randomforest,
    },
    columns=['lstm', 'xgb', 'logit', 'randomforest'],
)

# Map any -1 labels to 0 so every column uses the same 0/1 coding.
allpredictions2 = allpredictions.replace(-1, 0)

In [0]:
# (rows, columns) of the stacked predictions: one column per base model.
allpredictions.shape
# y_test.shape

## Meta classifier neural net

In [0]:
# Meta classifier over the base models' predictions.
# NOTE: this rebinds `model` again; after this cell it refers to the meta network.
model = Sequential()

# Input layer: one input per base-model prediction column.
# NOTE(review): neither Dense(32) nor Dense(16) is given an activation — confirm that
# a non-linearity was not intended.
model.add(Dense(32, input_dim=len(allpredictions.columns)))
# Hidden layer
model.add(Dense(16))

# Dropout on the hidden layer's output
model.add(Dropout(rate=0.5))

# Batch normalization ahead of the output unit
model.add(BatchNormalization())
model.add(Dense(1))
# Output layer activation
model.add(Activation('sigmoid'))

# Compile the model (rmsprop optimizer; accuracy is tracked under the key 'acc')
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
# NOTE(review): the meta classifier is fitted on the base models' predictions for the
# TEST rows, with y_test as the target. Any score later computed on these same rows is
# an in-sample figure, not a held-out estimate.
history_batchnorm = model.fit(allpredictions2.values,y_test, # fitted on test-set predictions and y_test
                             epochs=600, # 600 epochs
                             batch_size=allpredictions.shape[0], # one batch holding every row (full-batch updates)
                              # no callbacks are passed
                             verbose=0) # Suppress Keras output

In [0]:

# Accuracy of the meta classifier per training epoch.
plt.plot(history_batchnorm.history['acc'], label = 'Dropout + Batchnorm')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# NOTE(review): this evaluates on the same (allpredictions2, y_test) rows the meta
# classifier was fitted on, so the score is in-sample.
# .values passes a plain array, matching what was passed to fit().
meta_inputs = allpredictions2.values
final_predictions_score = model.evaluate(x=meta_inputs, y=y_test)
# Hard 0/1 labels by thresholding the sigmoid output at 0.5. `predict_classes` no
# longer exists in current Keras; for a single sigmoid unit it did exactly this.
final_predictions_NN = (model.predict(meta_inputs) > 0.5).astype("int32")

In [0]:
# Element 1 of the evaluate() result is the 'acc' metric (element 0 is the loss).
print(final_predictions_score[1])