In [None]:
import pandas as pd
import functions as fn
import numpy as np
import matplotlib.pyplot as plt

# Strategy: Static Entries and Exits
Using minute-level data from our in-sample period (2018), we compute the mean and the standard deviation of the spread.  The upper and lower thresholds of our mean-reversion strategy are set at the in-sample mean plus and minus a constant multiple of the spread's standard deviation.  During the trading period, a signal is generated whenever the spread is above the upper threshold or below the lower threshold.  Note that the positions we actually take must be lagged by 1 minute relative to our signals.  Without this shift, the results would assume that we can trade at the very instant a signal is observed, which would depend on high-frequency trading technology and negligible latency.

In [None]:
# In-sample window: minute bars from 01-01-2018 through 12-31-2018.
# One CSV per ticker; the target names and the paths below are listed in the
# same order, so each frame is bound to its matching ticker.
(qqq2018, has2018, aapl2018, ttwo2018, idxx2018,
 sbux2018, ctas2018, alxn2018, algn2018, payx2018) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/qqqMinute2018.csv",
        "../data/hasMinute2018.csv",
        "../data/aaplMinute2018.csv",
        "../data/ttwoMinute2018.csv",
        "../data/idxxMinute2018.csv",
        "../data/sbuxMinute2018.csv",
        "../data/ctasMinute2018.csv",
        "../data/alxnMinute2018.csv",
        "../data/algnMinute2018.csv",
        "../data/payxMinute2018.csv",
    )
]

In [None]:
def _index_close_by_time(minute_bars, close_label):
    """Index a minute-bar frame by its 'date_time' column and relabel 'close'.

    Returns a new frame; `minute_bars` itself is not modified.
    """
    return minute_bars.set_index('date_time').rename(columns={'close': close_label})


# Give every ticker a uniquely named close column so the frames can be joined
# side by side on their shared timestamp index.
qqq2018 = _index_close_by_time(qqq2018, 'qqqclose')
has2018 = _index_close_by_time(has2018, 'hasclose')
aapl2018 = _index_close_by_time(aapl2018, 'aaplclose')
ttwo2018 = _index_close_by_time(ttwo2018, 'ttwoclose')
idxx2018 = _index_close_by_time(idxx2018, 'idxxclose')
sbux2018 = _index_close_by_time(sbux2018, 'sbuxclose')
ctas2018 = _index_close_by_time(ctas2018, 'ctasclose')
alxn2018 = _index_close_by_time(alxn2018, 'alxnclose')
algn2018 = _index_close_by_time(algn2018, 'algnclose')
payx2018 = _index_close_by_time(payx2018, 'payxclose')

## Dataset for Engle-Granger Basket in 2018 (In-Sample Period)

In [None]:
# Close-price columns of the eight Engle-Granger basket components.
eg_component_closes2018 = [
    has2018[['hasclose']],
    aapl2018[['aaplclose']],
    ttwo2018[['ttwoclose']],
    sbux2018[['sbuxclose']],
    ctas2018[['ctasclose']],
    alxn2018[['alxnclose']],
    algn2018[['algnclose']],
    payx2018[['payxclose']],
]
# Outer-join everything onto QQQ by timestamp, then keep only the minutes for
# which every instrument has a value.
eg_basket_data2018 = qqq2018[['qqqclose']].join(eg_component_closes2018, how='outer').dropna()
eg_basket_data2018.tail(10)

## Dataset for Johansen Basket in 2018 (In-Sample Period)

In [None]:
# Close-price columns of the six Johansen basket components.
joh_component_closes2018 = [
    has2018[['hasclose']],
    ttwo2018[['ttwoclose']],
    idxx2018[['idxxclose']],
    sbux2018[['sbuxclose']],
    ctas2018[['ctasclose']],
    alxn2018[['alxnclose']],
]
# Outer-join everything onto QQQ by timestamp, then keep only the minutes for
# which every instrument has a value.
joh_basket_data2018 = qqq2018[['qqqclose']].join(joh_component_closes2018, how='outer').dropna()
joh_basket_data2018.tail(10)

In [None]:
# Column order matters here: the filtered slopes are later paired with these
# columns position by position.
eg_component_columns = ['hasclose', 'aaplclose', 'ttwoclose', 'sbuxclose',
                        'ctasclose', 'alxnclose', 'algnclose', 'payxclose']
joh_component_columns = ['hasclose', 'ttwoclose', 'sbuxclose',
                         'idxxclose', 'ctasclose', 'alxnclose']

# Engle-Granger basket: component log prices (DataFrame) and QQQ log price (ndarray).
syntheticAssetLogPriceEG2018 = eg_basket_data2018[eg_component_columns].apply(np.log)
qqqLogPriceEG2018 = np.log(eg_basket_data2018['qqqclose']).values

# Johansen basket: component log prices (DataFrame) and QQQ log price (ndarray).
syntheticAssetLogPriceJoh2018 = joh_basket_data2018[joh_component_columns].apply(np.log)
qqqLogPriceJoh2018 = np.log(joh_basket_data2018['qqqclose']).values

In [None]:
# Run the project's Kalman filter over the in-sample Engle-Granger data.
kf_eg2018 = fn.multivariateKalmanFilter(syntheticAssetLogPriceEG2018, qqqLogPriceEG2018)
state_means_eg2018, state_covs_eg2018 = kf_eg2018.filter(qqqLogPriceEG2018)

# The first `basket_size_eg2018` state columns are used as per-minute slopes,
# one per basket component. Any remaining state columns (presumably an
# intercept term -- TODO confirm against fn.multivariateKalmanFilter) are not used.
basket_size_eg2018 = syntheticAssetLogPriceEG2018.shape[1]
slopes_eg2018 = state_means_eg2018[:, :basket_size_eg2018]

In [None]:
# Per-minute synthetic-asset log price: dot product of that minute's filtered
# slopes with that minute's component log prices. `.values` is taken once,
# outside the loop, instead of once per row.
eg_component_log_prices2018 = syntheticAssetLogPriceEG2018.values
syntheticAssetEstimateEG2018 = [np.dot(slopes_eg2018[i], eg_component_log_prices2018[i])
                                for i in range(len(slopes_eg2018))]
spread_ts_eg2018 = qqqLogPriceEG2018 - syntheticAssetEstimateEG2018

# Move the timestamp index into a 'datetime' column.
# The index was created from the 'date_time' column, so reset_index() restores
# it as 'date_time' whenever the index name survives the join, and as 'index'
# only when the name was lost. Both spellings are mapped, so the 'datetime'
# selection below cannot fail with a KeyError.
# The guard keeps the cell re-runnable: a second run must not add another
# 'datetime' column.
if 'datetime' not in eg_basket_data2018.columns:
    eg_basket_data2018 = (
        eg_basket_data2018
        .reset_index()
        .rename(columns={'index': 'datetime', 'date_time': 'datetime'})
    )
eg_basket_data2018['logspread'] = spread_ts_eg2018
eg_basket_data2018['spread'] = np.exp(spread_ts_eg2018)

# In-sample frame used to derive the spread-difference threshold (q=0.2).
eg_backtest_data2018 = eg_basket_data2018[['datetime', 'qqqclose', 'hasclose', 'aaplclose', 'ttwoclose', 'sbuxclose', 
                      'ctasclose', 'alxnclose', 'algnclose', 'payxclose', 'spread']]
diff_thresh_eg = fn.calculateDiffThresh(eg_backtest_data2018, q=0.2)

In [None]:
# Show the value returned by fn.calculateDiffThresh (q=0.2) for the
# in-sample Engle-Granger basket.
print(diff_thresh_eg)

In [None]:
# Static thresholds: in-sample mean of the spread +/- c in-sample standard deviations.
in_sample_spread_eg = eg_backtest_data2018['spread']
spread_avg_eg = in_sample_spread_eg.mean()
spread_sd_eg = in_sample_spread_eg.std()
c = 1.25  # width of each band, in standard deviations
upper_threshold_eg = spread_avg_eg + c * spread_sd_eg
lower_threshold_eg = spread_avg_eg - c * spread_sd_eg

# In-sample spread with the upper (green), mean (yellow) and lower (red) levels.
plt.figure(figsize=(15, 7))
plt.plot(in_sample_spread_eg)
plt.title("Engle-Granger Basket Spread (In-Sample)")
plt.ylabel("Spread ($)")
plt.xlabel("Date")
plt.axhline(upper_threshold_eg, color='g', linestyle='dashed')
plt.axhline(spread_avg_eg, color='y', linestyle='dashed')
plt.axhline(lower_threshold_eg, color='r', linestyle='dashed')

In [None]:
# Report the static in-sample levels used by the Engle-Granger strategy.
print("Engle-Granger Basket Spread Thresholds \n")
for eg_label, eg_level in (("Mean: ", spread_avg_eg),
                           ("Lower Bar: ", lower_threshold_eg),
                           ("Upper Bar: ", upper_threshold_eg)):
    print(eg_label, eg_level)

In [None]:
# Run the project's Kalman filter over the in-sample Johansen data.
kf_Joh2018 = fn.multivariateKalmanFilter(syntheticAssetLogPriceJoh2018, qqqLogPriceJoh2018)
state_means_joh2018, state_covs_joh2018 = kf_Joh2018.filter(qqqLogPriceJoh2018)

# The first `basket_size_joh2018` state columns are used as per-minute slopes,
# one per basket component. Any remaining state columns (presumably an
# intercept term -- TODO confirm against fn.multivariateKalmanFilter) are not used.
basket_size_joh2018 = syntheticAssetLogPriceJoh2018.shape[1]
slopes_joh2018 = state_means_joh2018[:, :basket_size_joh2018]

In [None]:
# Per-minute synthetic-asset log price: dot product of that minute's filtered
# slopes with that minute's component log prices. `.values` is taken once,
# outside the loop, instead of once per row.
joh_component_log_prices2018 = syntheticAssetLogPriceJoh2018.values
syntheticAssetEstimateJoh2018 = [np.dot(slopes_joh2018[i], joh_component_log_prices2018[i])
                                 for i in range(len(slopes_joh2018))]
spread_ts_joh2018 = qqqLogPriceJoh2018 - syntheticAssetEstimateJoh2018

# Move the timestamp index into a 'datetime' column.
# The index was created from the 'date_time' column, so reset_index() restores
# it as 'date_time' whenever the index name survives the join, and as 'index'
# only when the name was lost. Both spellings are mapped, so the 'datetime'
# selection below cannot fail with a KeyError.
# The guard keeps the cell re-runnable: a second run must not add another
# 'datetime' column.
if 'datetime' not in joh_basket_data2018.columns:
    joh_basket_data2018 = (
        joh_basket_data2018
        .reset_index()
        .rename(columns={'index': 'datetime', 'date_time': 'datetime'})
    )
joh_basket_data2018['logspread'] = spread_ts_joh2018
joh_basket_data2018['spread'] = np.exp(spread_ts_joh2018)

# In-sample frame used to derive the spread-difference threshold (q=0.2).
# NOTE(review): 'idxxclose' is part of the Johansen basket (it is one of the
# columns of syntheticAssetLogPriceJoh2018) but is not selected here. Left
# unchanged because fn.calculateDiffThresh is not visible from this notebook;
# confirm whether the omission is intentional.
joh_backtest_data2018 = joh_basket_data2018[['datetime', 'qqqclose', 'hasclose','ttwoclose', 'sbuxclose', 
                      'ctasclose', 'alxnclose', 'spread']]
diff_thresh_joh = fn.calculateDiffThresh(joh_backtest_data2018, q=0.2)

In [None]:
# Show the value returned by fn.calculateDiffThresh (q=0.2) for the
# in-sample Johansen basket.
print(diff_thresh_joh)

In [None]:
# Static thresholds: in-sample mean of the spread +/- c in-sample standard deviations.
in_sample_spread_joh = joh_basket_data2018['spread']
spread_avg_joh = in_sample_spread_joh.mean()
spread_sd_joh = in_sample_spread_joh.std()
c = 1.25  # width of each band, in standard deviations
upper_threshold_joh = spread_avg_joh + c * spread_sd_joh
lower_threshold_joh = spread_avg_joh - c * spread_sd_joh

# In-sample spread with the upper (green), mean (yellow) and lower (red) levels.
plt.figure(figsize=(15, 7))
plt.plot(in_sample_spread_joh)
plt.title("Johansen Basket Spread (In-Sample)")
plt.ylabel("Spread ($)")
plt.xlabel("Date")
plt.axhline(upper_threshold_joh, color='g', linestyle='dashed')
plt.axhline(spread_avg_joh, color='y', linestyle='dashed')
plt.axhline(lower_threshold_joh, color='r', linestyle='dashed')

In [None]:
# Report the static in-sample levels used by the Johansen strategy.
print("Johansen Basket Spread Thresholds \n")
for joh_label, joh_level in (("Mean: ", spread_avg_joh),
                             ("Lower Bar: ", lower_threshold_joh),
                             ("Upper Bar: ", upper_threshold_joh)):
    print(joh_label, joh_level)

# Backtest and Out-of-Sample Period

In [None]:
# Pre-built basket datasets for the backtest / out-of-sample period
# (Engle-Granger first, Johansen second).
eg_data, joh_data = (
    pd.read_csv("../data/eg_basket_data.csv"),
    pd.read_csv("../data/joh_basket_data.csv"),
)

In [None]:
# Columns carried into the Engle-Granger backtest: timestamp, QQQ, the eight
# basket components and the spread.
eg_backtest_columns = ['datetime', 'qqqclose', 'hasclose', 'aaplclose', 'ttwoclose', 'sbuxclose',
                       'ctasclose', 'alxnclose', 'algnclose', 'payxclose', 'spread']
# Attach the static in-sample levels (lower, upper, mean) via the project helper.
eg_backtest_data = fn.createBars(eg_data[eg_backtest_columns],
                                 lower_threshold_eg, upper_threshold_eg, spread_avg_eg)

In [None]:
# Plot only the last 450 rows so the spread and its bands stay readable.
eg_recent_bars = eg_backtest_data.iloc[-450:]

plt.figure(figsize=(15, 7))
plt.plot(eg_recent_bars['spread'])
plt.plot(eg_recent_bars['upperband'], color='g')
plt.plot(eg_recent_bars['ema'], color='y')
plt.plot(eg_recent_bars['lowerband'], color='r')
plt.title("Engle-Granger Basket Spread and Static Thresholds (Out-of-Sample)")
plt.ylabel("Spread ($)")
plt.xlabel("Date")

In [None]:
# Columns carried into the Johansen backtest: timestamp, QQQ, the six basket
# components and the spread.
joh_backtest_columns = ['datetime', 'qqqclose', 'hasclose', 'ttwoclose', 'sbuxclose',
                        'idxxclose', 'ctasclose', 'alxnclose', 'spread']
# Attach the static in-sample levels (lower, upper, mean) via the project helper.
joh_backtest_data = fn.createBars(joh_data[joh_backtest_columns],
                                  lower_threshold_joh, upper_threshold_joh, spread_avg_joh)

In [None]:
# Plot only the last 450 rows so the spread and its bands stay readable.
joh_recent_bars = joh_backtest_data.iloc[-450:]

plt.figure(figsize=(15, 7))
plt.plot(joh_recent_bars['spread'])
plt.plot(joh_recent_bars['upperband'], color='g')
plt.plot(joh_recent_bars['ema'], color='y')
plt.plot(joh_recent_bars['lowerband'], color='r')
plt.title("Johansen Basket Spread and Static Thresholds (Out-of-Sample)")
plt.ylabel("Spread ($)")
plt.xlabel("Date")

## Generating Signals and Positions
Positions are generated based on the thresholds determined in the in-sample period.  When the spread is below our lower threshold, a long signal is generated and we enter a long spread position in the subsequent minute.  Symmetrically, when the spread is above our upper threshold, a short signal is generated and we enter a short spread position in the subsequent minute.  When the spread reverts back to the mean from our in-sample period, we close our positions.

For our optimal positions, we aim to forecast the mean reversion by requiring not only that the spread is above/below the upper/lower threshold, but also that the minute-to-minute percentage difference in spread magnitude falls in the bottom 20% of the minute-to-minute percentage differences observed during our in-sample period.

In [None]:
# Build two position sets for the Engle-Granger basket. Each helper receives
# its own copy of eg_backtest_data, so the bars frame itself is left untouched.
eg_final_data = fn.createPositions(eg_backtest_data.copy()) # no overnight positions, all positions exited by EOD
# "Optimal" variant: additionally receives the in-sample threshold diff_thresh_eg.
eg_final_data_opt = fn.createOptimalPositions(eg_backtest_data.copy(), threshold=diff_thresh_eg)

In [None]:
# Build two position sets for the Johansen basket. Each helper receives its
# own copy of joh_backtest_data, so the bars frame itself is left untouched.
joh_final_data = fn.createPositions(joh_backtest_data.copy()) # no overnight positions, all positions exited by EOD
# "Optimal" variant: additionally receives the in-sample threshold diff_thresh_joh.
joh_final_data_opt = fn.createOptimalPositions(joh_backtest_data.copy(), threshold=diff_thresh_joh)

## Trade Log
The trade log is generated to compile all the individual trades executed in our backtest and some information about them.  This includes the start and end time, the holding period, what position was taken (long or short), position size and value, as well as profit and return information.  The information in this dataset can be used to analyze the strategy and the properties of the trades executed by the strategy.

### Engle-Granger Basket

In [None]:
# Slope and close columns are listed in the same component order so they can
# be paired position by position.
# NOTE(review): 'payxSlopes' is the only slope column with a trailing 's';
# confirm it matches the header in eg_basket_data.csv.
eg_slope_columns = ['hasSlope', 'aaplSlope', 'ttwoSlope', 'sbuxSlope',
                    'ctasSlope', 'alxnSlope', 'algnSlope', 'payxSlopes']
eg_close_columns = ['hasclose', 'aaplclose', 'ttwoclose', 'sbuxclose',
                    'ctasclose', 'alxnclose', 'algnclose', 'payxclose']

slopes_eg = eg_data[eg_slope_columns]
prices_eg = eg_final_data[eg_close_columns].values

In [None]:
# Trade log and per-minute frame for the baseline Engle-Granger positions.
# Positional inputs: timestamps, positions, QQQ closes, component closes and
# the slopes rounded to 3 decimals. No stop-loss; lot size 1000.
tradeLog_eg, minuteDf_eg = fn.constructTradeLog(
    eg_final_data['datetime'].values,
    eg_final_data['position'].values,
    eg_final_data['qqqclose'].values,
    prices_eg,
    slopes_eg.values.round(3),
    stoploss=None,
    lot_size=1000,
)
tradeLog_eg.tail()

In [None]:
# Trade log and per-minute frame for the "optimal" Engle-Granger positions.
# prices_eg was taken from eg_final_data rather than eg_final_data_opt; this
# assumes both frames hold the same rows in the same order -- TODO confirm.
tradeLog_eg_opt, minuteDf_eg_opt = fn.constructTradeLog(
    eg_final_data_opt['datetime'].values,
    eg_final_data_opt['position'].values,
    eg_final_data_opt['qqqclose'].values,
    prices_eg,
    slopes_eg.values.round(3),
    stoploss=None,
    lot_size=1000,
)
tradeLog_eg_opt.tail()

In [None]:
# Cumulative growth of 1 unit for the baseline and "optimal" Engle-Granger
# strategies, compounded minute by minute.
# An explicit copy is taken: adding columns to a bare `df[[...]]` selection
# triggers pandas' SettingWithCopyWarning.
returns_df_eg = minuteDf_eg[['datetime']].copy()
returns_df_eg['cumulative_returns'] = np.cumprod(1 + minuteDf_eg['returns'])
returns_df_eg['cumulative_returns_opt'] = np.cumprod(1 + minuteDf_eg_opt['returns'])
returns_df_eg = returns_df_eg.set_index('datetime')
returns_df_eg.plot(figsize=[15, 7], title='Engle-Granger Basket Strategy Returns')

In [None]:
# Aggregate the minute-level returns to daily returns with the project helper,
# then compute an annualized Sharpe ratio for each Engle-Granger variant.
eg_return_columns = ['returns', 'datetime']
daily_eg = fn.calculateDailyReturns(minuteDf_eg[eg_return_columns])
daily_eg_opt = fn.calculateDailyReturns(minuteDf_eg_opt[eg_return_columns])
sharpe_eg = fn.calculateAnnualizedSharpeRatio(daily_eg)
sharpe_eg_opt = fn.calculateAnnualizedSharpeRatio(daily_eg_opt)

In [None]:
# Summary of the Engle-Granger backtest: total return in percent (final
# cumulative value minus 1, times 100) and annualized Sharpe ratios.
print("Engle-Granger Basket Backtest Results")
eg_total_return_pct = (returns_df_eg['cumulative_returns'].iloc[-1] - 1) * 100
print("Returns: ", eg_total_return_pct, '%')
eg_total_return_opt_pct = (returns_df_eg['cumulative_returns_opt'].iloc[-1] - 1) * 100
print("Returns (Optimized): ", eg_total_return_opt_pct, '%')
print("Sharpe: ", sharpe_eg)
print("Sharpe (Optimized)", sharpe_eg_opt)

### Johansen Basket

In [None]:
# Slope and close columns are listed in the same component order so they can
# be paired position by position.
joh_slope_columns = ['hasSlope', 'ttwoSlope', 'sbuxSlope', 'idxxSlope', 'ctasSlope', 'alxnSlope']
joh_close_columns = ['hasclose', 'ttwoclose', 'sbuxclose', 'idxxclose', 'ctasclose', 'alxnclose']

slopes_joh = joh_data[joh_slope_columns]
prices_joh = joh_final_data[joh_close_columns].values

In [None]:
# Trade log and per-minute frame for the baseline Johansen positions.
# Positional inputs: timestamps, positions, QQQ closes, component closes and
# the slopes rounded to 3 decimals. No stop-loss; lot size 1000.
tradeLog_joh, minuteDf_joh = fn.constructTradeLog(
    joh_final_data['datetime'].values,
    joh_final_data['position'].values,
    joh_final_data['qqqclose'].values,
    prices_joh,
    slopes_joh.values.round(3),
    stoploss=None,
    lot_size=1000,
)
tradeLog_joh.tail()

In [None]:
# Trade log and per-minute frame for the "optimal" Johansen positions.
# prices_joh was taken from joh_final_data rather than joh_final_data_opt; this
# assumes both frames hold the same rows in the same order -- TODO confirm.
tradeLog_joh_opt, minuteDf_joh_opt = fn.constructTradeLog(
    joh_final_data_opt['datetime'].values,
    joh_final_data_opt['position'].values,
    joh_final_data_opt['qqqclose'].values,
    prices_joh,
    slopes_joh.values.round(3),
    stoploss=None,
    lot_size=1000,
)
tradeLog_joh_opt.tail()

In [None]:
# Cumulative growth of 1 unit for the baseline and "optimal" Johansen
# strategies, compounded minute by minute.
# An explicit copy is taken: adding columns to a bare `df[[...]]` selection
# triggers pandas' SettingWithCopyWarning.
returns_df_joh = minuteDf_joh[['datetime']].copy()
returns_df_joh['cumulative_returns'] = np.cumprod(1 + minuteDf_joh['returns'])
returns_df_joh['cumulative_returns_opt'] = np.cumprod(1 + minuteDf_joh_opt['returns'])
returns_df_joh = returns_df_joh.set_index('datetime')
returns_df_joh.plot(figsize=[15, 7], title='Johansen Basket Strategy Returns')

In [None]:
# Aggregate the minute-level returns to daily returns with the project helper,
# then compute an annualized Sharpe ratio for each Johansen variant.
joh_return_columns = ['returns', 'datetime']
daily_joh = fn.calculateDailyReturns(minuteDf_joh[joh_return_columns])
daily_joh_opt = fn.calculateDailyReturns(minuteDf_joh_opt[joh_return_columns])
sharpe_joh = fn.calculateAnnualizedSharpeRatio(daily_joh)
sharpe_joh_opt = fn.calculateAnnualizedSharpeRatio(daily_joh_opt)

In [None]:
# Summary of the Johansen backtest: total return in percent (final cumulative
# value minus 1, times 100) and annualized Sharpe ratios.
print("Johansen Basket Backtest Results")
joh_total_return_pct = (returns_df_joh['cumulative_returns'].iloc[-1] - 1) * 100
print("Returns: ", joh_total_return_pct, '%')
joh_total_return_opt_pct = (returns_df_joh['cumulative_returns_opt'].iloc[-1] - 1) * 100
print("Returns (Optimized): ", joh_total_return_opt_pct, '%')
print("Sharpe: ", sharpe_joh)
print("Sharpe (Optimized)", sharpe_joh_opt)