In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
# and the old name was removed in later releases. Prefer the new name when the
# installed Matplotlib offers it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Load the price series; the 'time' column is parsed as datetimes and becomes the index.
data = pd.read_csv(
    'five_minute.csv',
    index_col='time',
    parse_dates=['time'],
)

In [3]:
# Log return of each bar relative to the previous bar; the first row has no
# predecessor and is therefore NaN.
# The 'price' column is used explicitly instead of dividing the whole frame:
# the whole-frame form only works while `data` has exactly one column, so it
# raises as soon as this cell is re-run after 'returns' has been added.
data['returns'] = np.log(data['price'] / data['price'].shift(1))

In [4]:
data

Unnamed: 0_level_0,price,returns
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 22:00:00+00:00,1.146580,
2019-01-01 22:05:00+00:00,1.146350,-0.000201
2019-01-01 22:10:00+00:00,1.146320,-0.000026
2019-01-01 22:15:00+00:00,1.146320,0.000000
2019-01-01 22:20:00+00:00,1.146530,0.000183
...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004
2019-12-30 23:40:00+00:00,1.120210,0.000027
2019-12-30 23:45:00+00:00,1.120295,0.000076
2019-12-30 23:50:00+00:00,1.120275,-0.000018


In [5]:
# Label every bar with the sign of its return: 1.0 up, -1.0 down, 0.0 unchanged
# (NaN stays NaN for the first row).
data['direction'] = data['returns'].pipe(np.sign)

In [6]:
data

Unnamed: 0_level_0,price,returns,direction
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 22:00:00+00:00,1.146580,,
2019-01-01 22:05:00+00:00,1.146350,-0.000201,-1.0
2019-01-01 22:10:00+00:00,1.146320,-0.000026,-1.0
2019-01-01 22:15:00+00:00,1.146320,0.000000,0.0
2019-01-01 22:20:00+00:00,1.146530,0.000183,1.0
...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0


In [7]:
# Class balance of the target: how many bars moved up, down, or not at all.
data['direction'].value_counts()

 1.0    36058
-1.0    35702
 0.0     1959
Name: direction, dtype: int64

In [8]:
# Number of lagged return columns (lag1 .. lagN) built as model features below.
lags = 5

In [9]:
# Feature column names 'lag1' .. 'lag<lags>'.
cols = ['lag{}'.format(lag) for lag in range(1, lags + 1)]

# Each feature column holds the return observed `lag` bars earlier.
for lag, col in enumerate(cols, start=1):
    data[col] = data['returns'].shift(lag)

# Shifting leaves NaNs in the leading rows; remove every row that has a gap.
data.dropna(inplace=True)

In [10]:
data

Unnamed: 0_level_0,price,returns,direction,lag1,lag2,lag3,lag4,lag5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01 22:30:00+00:00,1.146455,-0.000017,-1.0,-0.000048,0.000183,0.000000,-0.000026,-0.000201
2019-01-01 22:35:00+00:00,1.146455,0.000000,0.0,-0.000017,-0.000048,0.000183,0.000000,-0.000026
2019-01-01 22:40:00+00:00,1.146370,-0.000074,-1.0,0.000000,-0.000017,-0.000048,0.000183,0.000000
2019-01-01 22:45:00+00:00,1.146315,-0.000048,-1.0,-0.000074,0.000000,-0.000017,-0.000048,0.000183
2019-01-01 22:50:00+00:00,1.146475,0.000140,1.0,-0.000048,-0.000074,0.000000,-0.000017,-0.000048
...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0,-0.000112,-0.000018,0.000022,-0.000004,0.000089
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0,-0.000004,-0.000112,-0.000018,0.000022,-0.000004
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0,0.000027,-0.000004,-0.000112,-0.000018,0.000022
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0,0.000076,0.000027,-0.000004,-0.000112,-0.000018


# Predicting Market Direction with Logistic Regression in scikit-learn

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Since simple models tend to underfit the data, regularization is reduced as
# much as possible by raising C to a large value such as one million.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
lm = LogisticRegression(
    C=1e6,
    max_iter=100000,
    multi_class='ovr',
)

In [13]:
# Fit the classifier: lagged returns are the features, the bar's direction is the target.
lm.fit(data[cols], data['direction'])

LogisticRegression(C=1000000.0, max_iter=100000, multi_class='ovr')

In [14]:
# In-sample prediction: the model scores the very rows it was fitted on.
data['pred'] = lm.predict(data.loc[:, cols])

In [15]:
data

Unnamed: 0_level_0,price,returns,direction,lag1,lag2,lag3,lag4,lag5,pred
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01 22:30:00+00:00,1.146455,-0.000017,-1.0,-0.000048,0.000183,0.000000,-0.000026,-0.000201,1.0
2019-01-01 22:35:00+00:00,1.146455,0.000000,0.0,-0.000017,-0.000048,0.000183,0.000000,-0.000026,1.0
2019-01-01 22:40:00+00:00,1.146370,-0.000074,-1.0,0.000000,-0.000017,-0.000048,0.000183,0.000000,-1.0
2019-01-01 22:45:00+00:00,1.146315,-0.000048,-1.0,-0.000074,0.000000,-0.000017,-0.000048,0.000183,1.0
2019-01-01 22:50:00+00:00,1.146475,0.000140,1.0,-0.000048,-0.000074,0.000000,-0.000017,-0.000048,1.0
...,...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0,-0.000112,-0.000018,0.000022,-0.000004,0.000089,1.0
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0,-0.000004,-0.000112,-0.000018,0.000022,-0.000004,1.0
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0,0.000027,-0.000004,-0.000112,-0.000018,0.000022,1.0
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0,0.000076,0.000027,-0.000004,-0.000112,-0.000018,-1.0


In [16]:
# Distribution of the predicted directions.
data['pred'].value_counts()

 1.0    41406
-1.0    32308
Name: pred, dtype: int64

In [None]:
# Sign of (actual * predicted): 1.0 when both agree, -1.0 when they disagree,
# 0.0 when either value is zero. Count how often each outcome occurs.
hits = data['direction'].mul(data['pred']).pipe(np.sign).value_counts()

In [None]:
hits

In [None]:
# Share of rows where prediction and realized direction agree.
hit_ratio = hits.loc[1.0] / hits.sum()

In [None]:
hit_ratio

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of rows whose predicted direction equals the realized direction.
accuracy_score(data['direction'], data['pred'])

# In-sample Backtesting & the Look-Ahead Bias

In [None]:
# Per-bar strategy log return: the bar's log return multiplied by the predicted direction.
data['strategy'] = data['returns'].mul(data['pred'])

In [None]:
data 

In [None]:
# Log returns add up over time; exponentiating the running sum gives the
# cumulative growth factor of the raw series and of the strategy.
data['creturns'] = np.exp(data['returns'].cumsum())
data['cstrategy'] = np.exp(data['strategy'].cumsum())

In [None]:
# Compare the cumulative raw return with the cumulative strategy return.
data.loc[:, ['creturns', 'cstrategy']].plot(figsize=(12, 8));

In [None]:
data

In [None]:
# Absolute change of the prediction from one bar to the next; the first row has
# no predecessor and is set to 0.
data['trades'] = data['pred'].diff().abs().fillna(0)

In [None]:
# Tally how often the prediction changed between consecutive bars (nonzero)
# versus stayed the same (0).
data['trades'].value_counts()

In [None]:
data

# Out-of-Sample Forward Testing

In [None]:
# Replace `data` with the held-out test set; 'time' is parsed as datetimes and becomes the index.
data = pd.read_csv(
    '../DataSets/test_set.csv',
    index_col='time',
    parse_dates=['time'],
)

In [None]:
data 

In [None]:
# Log return of each bar relative to the previous bar (NaN for the first row).
# Dividing the whole frame only works while `data` holds exactly one column,
# so the original form raises when this cell is re-run after 'returns' exists.
# The test file's column names are not visible here, so the first column is
# taken positionally; for a single-column file this is the same series the
# whole-frame division used -- TODO confirm the column is the price.
data['returns'] = np.log(data.iloc[:, 0] / data.iloc[:, 0].shift(1))

In [None]:
# Realized direction of every bar: the sign of its return (1.0, -1.0 or 0.0).
data['direction'] = data['returns'].pipe(np.sign)

In [None]:
data 

In [None]:
# Number of lagged return columns (lag1 .. lagN) built as features for the test set below.
lags = 5

In [None]:
# Build the lagged-return feature columns 'lag1' .. 'lag<lags>' on the test set.
cols = ['lag{}'.format(lag) for lag in range(1, lags + 1)]

for lag, col in enumerate(cols, start=1):
    # Column `col` carries the return observed `lag` bars earlier.
    data[col] = data['returns'].shift(lag)

# Rows left incomplete by the shifts are removed.
data.dropna(inplace=True)

In [None]:
data

In [None]:
# Out-of-sample prediction: `lm` is applied to the newly loaded data without being refitted.
data['pred'] = lm.predict(data.loc[:, cols])

In [None]:
data

In [None]:
# Distribution of the predicted directions on the test set.
data['pred'].value_counts()

In [None]:
# Sign of (actual * predicted): 1.0 for agreement, -1.0 for disagreement,
# 0.0 when either value is zero. Count each outcome.
hits = data['direction'].mul(data['pred']).pipe(np.sign).value_counts()

In [None]:
hits

In [None]:
# Share of test rows where prediction and realized direction agree.
hit_ratio = hits.loc[1.0] / hits.sum()

In [None]:
hit_ratio

In [None]:
# Per-bar strategy log return: the bar's log return multiplied by the predicted direction.
data['strategy'] = data['returns'].mul(data['pred'])

In [None]:
# Exponentiate the running sum of log returns to get the cumulative growth
# factor of the raw series and of the strategy.
data['creturns'] = np.exp(data['returns'].cumsum())
data['cstrategy'] = np.exp(data['strategy'].cumsum())

In [None]:
# Compare the cumulative raw return with the cumulative strategy return on the test set.
data.loc[:, ['creturns', 'cstrategy']].plot(figsize=(12, 8));

In [None]:
# Absolute change of the prediction from one bar to the next; the first row has
# no predecessor and is set to 0.
data['trades'] = data['pred'].diff().abs().fillna(0)

In [None]:
# Tally how often the prediction changed between consecutive bars (nonzero)
# versus stayed the same (0).
data['trades'].value_counts()

In [None]:
data