# Spread Trading

## Utility Functions

In [32]:
# <include-spread_trading/utils.py>

## Imports

In [65]:
# <imports>
import datetime
from enum import Enum
from dataclasses import dataclass
import sqlite3


import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import trange

from spread_trading import utils

pd.options.plotting.backend = "plotly"

## Fetching Data from Quandle

[End of Day US Stock Prices](https://www.quandl.com/data/EOD-End-of-Day-US-Stock-Prices/documentation)

Security   | Description
-----------|-------------------------------------------------------------------------------------------------------
SIVR |[Aberdeen Standard Physical Silver Shares ETF](https://www.quandl.com/data/EOD/SIVR)
SLV  | [iShares Silver Trust ](https://www.quandl.com/data/EOD/SLV)
FCOM    | [Fidelity MSCI Communication Services Index ETF](https://www.quandl.com/data/EOD/FCOM)
VOX    | [Vanguard Communication Services ETF](https://www.quandl.com/data/EOD/VOX)

In [34]:
start_date = '2018-12-02'
end_date = '2020-12-31'


pairs = (("SIVR", "SLV"), ("FCOM", "VOX"))
data_feed = "EOD"

sec_list = [f"{data_feed}/{s}" for s in sum(pairs, ())]

query_params = {
    "dataset": sec_list,
    "start_date": start_date, "end_date": end_date
}

In [35]:
data = utils.fetch_data(query_params, data_dir=".")

Loading 912db6fc461bab17deb741d895b46edb.csv from disk.


## Data Preparation

Flatten the data.

In [36]:
df_all = pd.DataFrame()
for c in data.columns:
    if c[-2:] not in ["_x", "_y"]:
        df_all = pd.concat([df_all, utils.expand_series(data[c], data_feed)])
df_all.head()

Unnamed: 0,date,data_feed,security,series,value
0,2018-12-03,EOD,SIVR,open,14.11
1,2018-12-04,EOD,SIVR,open,14.22
2,2018-12-06,EOD,SIVR,open,14.03
3,2018-12-07,EOD,SIVR,open,14.12
4,2018-12-10,EOD,SIVR,open,14.1


Use groupby to create rows that include columns for each series.

In [37]:
def set_cols(c):
    return c[0] if not c[1] else c[1]

df_g = df_all.groupby(df_all.columns[:-1].to_list()).max().unstack("series").reset_index()
df_g.columns = list(map(set_cols, df_g.columns))
df_g["date"] = pd.to_datetime(df_g["date"])
df_g["adj_dollar_volume"] = df_g["adj_close"] * df_g["adj_volume"]
df_data = df_g.set_index(["security", "date"]).unstack("security")
df_data.tail()

Unnamed: 0_level_0,data_feed,data_feed,data_feed,data_feed,adj_close,adj_close,adj_close,adj_close,adj_high,adj_high,...,split,split,volume,volume,volume,volume,adj_dollar_volume,adj_dollar_volume,adj_dollar_volume,adj_dollar_volume
security,FCOM,SIVR,SLV,VOX,FCOM,SIVR,SLV,VOX,FCOM,SIVR,...,SLV,VOX,FCOM,SIVR,SLV,VOX,FCOM,SIVR,SLV,VOX
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-12-24,EOD,EOD,EOD,EOD,44.801716,24.92,23.96,118.425537,44.971457,24.97,...,1.0,1.0,27895.0,374081.0,10558562.0,27910.0,1249744.0,9322098.52,252983100.0,3305257.0
2020-12-28,EOD,EOD,EOD,EOD,45.410787,25.38,24.43,119.903235,45.50065,25.8299,...,1.0,1.0,74824.0,729095.0,29365536.0,144241.0,3397817.0,18504431.1,717400000.0,17294960.0
2020-12-29,EOD,EOD,EOD,EOD,45.410787,25.36,24.39,119.89325,45.629454,25.49,...,1.0,1.0,66109.0,593527.0,23844516.0,125896.0,3002062.0,15051844.72,581567700.0,15094080.0
2020-12-30,EOD,EOD,EOD,EOD,45.129716,25.74,24.76,119.184355,45.600498,25.74,...,1.0,1.0,53689.0,774576.0,18440246.0,74465.0,2422969.0,19937586.24,456580500.0,8875063.0
2020-12-31,EOD,EOD,EOD,EOD,45.430757,25.55,24.57,120.013064,45.459513,25.69,...,1.0,1.0,45358.0,786957.0,24614049.0,154295.0,2060648.0,20106751.35,604767200.0,18517420.0


## Add Returns and Dollar Volume

### Add Returns

In [38]:
returns = utils.get_returns(df_data["adj_close"], return_type="log")
df_data = pd.concat([df_data, returns], axis=1)

### Add Dollar Volume

In [39]:
med_adj_volume = df_data[["adj_dollar_volume"]].shift(1).rolling(15).median()
med_adj_volume.columns = pd.MultiIndex.from_tuples(
    tuples=[(f"med_dollar_volume", security) for security in med_adj_volume.columns.get_level_values(1)],
    names=["series", "security"],
)
df_data = pd.concat([df_data, med_adj_volume], axis=1).dropna()
df_data.tail()

Unnamed: 0_level_0,data_feed,data_feed,data_feed,data_feed,adj_close,adj_close,adj_close,adj_close,adj_high,adj_high,...,adj_dollar_volume,adj_dollar_volume,adj_return,adj_return,adj_return,adj_return,med_dollar_volume,med_dollar_volume,med_dollar_volume,med_dollar_volume
security,FCOM,SIVR,SLV,VOX,FCOM,SIVR,SLV,VOX,FCOM,SIVR,...,SLV,VOX,FCOM,SIVR,SLV,VOX,FCOM,SIVR,SLV,VOX
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-12-24,EOD,EOD,EOD,EOD,44.801716,24.92,23.96,118.425537,44.971457,24.97,...,252983100.0,3305257.0,-0.000446,0.006845,0.0067,0.000675,3136007.0,14115816.5,586563300.0,12526640.0
2020-12-28,EOD,EOD,EOD,EOD,45.410787,25.38,24.43,119.903235,45.50065,25.8299,...,717400000.0,17294960.0,0.013503,0.018291,0.019426,0.012401,3136007.0,13800820.6,586563300.0,12526640.0
2020-12-29,EOD,EOD,EOD,EOD,45.410787,25.36,24.39,119.89325,45.629454,25.49,...,581567700.0,15094080.0,0.0,-0.000788,-0.001639,-8.3e-05,3159410.0,18082275.75,637540000.0,12526640.0
2020-12-30,EOD,EOD,EOD,EOD,45.129716,25.74,24.76,119.184355,45.600498,25.74,...,456580500.0,8875063.0,-0.006209,0.014873,0.015056,-0.00593,3159410.0,15051844.72,586563300.0,12526640.0
2020-12-31,EOD,EOD,EOD,EOD,45.430757,25.55,24.57,120.013064,45.459513,25.69,...,604767200.0,18517420.0,0.006648,-0.007409,-0.007703,0.006929,3002062.0,18082275.75,586563300.0,12526640.0


### Quick Check

In [40]:
sample_series = df_data[("adj_dollar_volume", "FCOM")].iloc[-16:-1]
assert len(sample_series) == 15
assert sample_series.median() == df_data[("med_dollar_volume", "FCOM")].iloc[-1]

dataframe with closing prices, s, trade_sizes as iterrm

## Strategy

For context, below is chart of the distribution of daily returns for each spread.
* All of the securities exprienced heightened volatility during the pandemic period
* The mean returns wer 0.001 for SIVR-SLV and 0.001 for FCOM-VOX
* Standard deviation for SIVR-SLV was 0.045 and 0.016 for FCOM-VOX


In [41]:
utils.make_spread_charts(pairs, df_data, "Spread Charts", price_col="adj_close")

In [42]:
utils.make_tail_charts(
    pairs, df_data,
    title_text="ETF Spreads - Distribution of Daily Log Returns",
    data_feed="EOD",
    price_col="adj_close",
    return_type='log',
    moments_xanchors=("left", "left")
)

Implementation notes:
* Assumes transaction cost of $0.01 per share on both opening and closing of short and long positions.
* Does not include any received carry on short positions
* Does not include a stop loss parameter
* Red dots indicate trades to short the spread and green dots indicate trades to buy the spread
* There is some opacity in the dots so that you can tell when double transactions ocurred due to the reversal of the spread
* The green line in the returns chart is total return, the orange one is the return for an incremental tick
* The stats box to the right of the returns chart is for the tick returns
* `open_threshold` refers to the spread threshold above which a position is opened
* `close_threshold` refers to the spread threshold below which open positions are closed

## Baseline for SIVR-SLV

#### Parameters
* Capital of $1,180,152 
* Rolling 15-day average for the spreads
* `open_threshold` of 0.00161 - 80th percentile of absolute returns
* `close_threshold` of 0.00034 - 20th percentile of absolute returns

#### Results
* Resulted in a loss of 0.0019, $2,298
* Note that transaction costs were $10,259 - at tranasction cost of $0.005 per share the strategy would have made 0.0024, $2,832
* This strategy appears to generate most of its profit in two short period with higher volatility, one in March of 2020 and antoher in September of 2020


In [43]:
strategy_params = dict(
    pair=pairs[0],
    window=15,
    open_threshold=0.00161,
    close_threshold=0.00034,
    run=True,
    transact_cost_per_share = 0.01,
    closed_positions = [],
)
strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

strategy = utils.Strategy(**strategy_params)
strategy.plot()

### Experiment 1: Longer Window
* Below are the results for a 60 day window
* This strategy loses 0.0039, $4,588

In [44]:
strategy_params = dict(
    pair=pairs[0],
    window=60,
    open_threshold=0.00161,
    close_threshold=0.00034,
    run=True,
    transact_cost_per_share = 0.01,
    closed_positions = [],
)
strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

strategy = utils.Strategy(**strategy_params)
strategy.plot()

In [45]:
window_returns = []
for window in [5, 10, 15, 30, 60, 90]:
    strategy_params = dict(
        pair=pairs[0],
        window=window,
        open_threshold=0.00161,
        close_threshold=0.00034,
        run=True,
        transact_cost_per_share = 0.01,
        closed_positions = [],
    )
    strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

    strategy = utils.Strategy(**strategy_params)
    window_returns.append({"window": window, "profit": strategy.net_profit, "capital": strategy.capital})

* This strategy appears to perform better with shorter window lengths, but at the given return thresholds it loses money regardless of window length.

In [46]:
df_win_ret = pd.DataFrame(window_returns)
df_win_ret["return"] = np.log(1 + df_win_ret.profit/df_win_ret.capital)
fig = df_win_ret.plot(x="window", y="return", template='none', title="Returns by Window Length", kind="bar")
fig.update_xaxes(type="category")
fig.show()

## Baseline for FCOM-VOX

#### Parameters
* Capital of $140,935
* Rolling 15-day average for the spreads
* `open_threshold` of 0.00181 - 80th percentile of absolute returns
* `close_threshold` of 0.00037 - 20th percentile of absolute returns

#### Results
* Resulted in a loss of 0.0071, $1,005
* Most of the profit was generated in a small number of trades in March of 2020



In [47]:
strategy_params = dict(
    pair=pairs[1],
    window=15,
    open_threshold=0.00181,
    close_threshold=0.00037,
    run=True,
    transact_cost_per_share = 0.01,
    closed_positions = [],
)
strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

strategy = utils.Strategy(**strategy_params)
strategy.plot()

### Experiments
* For the FCOM-VOX spread the experiment will be on the return window.

In [48]:
abs_spread = strategy.df_ticks.spread.abs()
def get_thresholds(thresholds):
    return (abs_spread.quantile(thresholds[0]), abs_spread.quantile(thresholds[1]))

In [49]:
threshold_list = [(0.05, 0.95), (0.10, 0.90), (0.20, 0.80), (0.30, 0.70), (0.40, 0.60), (0.50, 0.90), (0.70, 0.95), (0.05, 0.20)]

In [50]:
return_bands = []
for return_band in threshold_list:
    close_threshold, open_threshold = get_thresholds(return_band)
    strategy_params = dict(
        pair=pairs[1],
        window=15,
        open_threshold=open_threshold,
        close_threshold=close_threshold,
        run=True,
        transact_cost_per_share = 0.01,
        closed_positions = [],
    )
    strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

    strategy = utils.Strategy(**strategy_params)
    return_bands.append({"band": str(return_band), "profit": strategy.net_profit, "capital": strategy.capital})

* Here it looks like the best beforming band was one that was narrower (0.30, 0.70) than the original band from the 20th percentile to the 80th percentile.

In [51]:
df_band_ret = pd.DataFrame(return_bands)
df_band_ret["return"] = np.log(1 + df_band_ret.profit/df_band_ret.capital)
fig = df_band_ret.plot(x="band", y="return", template='none', title="Returns by Spread Difference Thresholds", kind="bar")
fig.update_xaxes(type="category")
fig.show()

In [52]:
close_threshold, open_threshold = get_thresholds((0.30, 0.70))
strategy_params = dict(
    pair=pairs[1],
    window=15,
    open_threshold=open_threshold,
    close_threshold=close_threshold,
    run=True,
    transact_cost_per_share = 0.01,
    closed_positions = [],
)
strategy_params["df_ticks"] = utils.get_ticks(strategy_params["pair"], df_data, strategy_params["window"])

strategy = utils.Strategy(**strategy_params)
strategy.plot()

In [53]:
dates = ("2019-06-13", "2019-06-20")
strategy.df_ticks.loc[slice(*dates)]

Unnamed: 0_level_0,adj_close,adj_close,volume,volume,adj_return,adj_return,med_dollar_volume,med_dollar_volume,position_size,position_size,rolling_adj_return,rolling_adj_return,spread
security,FCOM,VOX,FCOM,VOX,FCOM,VOX,FCOM,VOX,FCOM,VOX,FCOM,VOX,Unnamed: 13_level_1
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2019-06-13,32.092493,83.662981,36683.0,144139.0,0.012637,0.009671,2748885.0,12150610.0,857.0,329.0,-0.019172,-0.020907,-0.001735
2019-06-14,32.13181,83.820094,299374.0,40827.0,0.001224,0.001876,2748885.0,12150610.0,856.0,328.0,-0.004274,-0.00584,-0.001567
2019-06-17,32.450868,84.733317,69411.0,85353.0,0.009881,0.010836,2941990.0,12059100.0,907.0,347.0,0.006217,0.003948,-0.002269
2019-06-18,32.66259,85.224297,49125.0,147205.0,0.006503,0.005778,2941990.0,12059100.0,901.0,345.0,0.011501,0.009609,-0.001892
2019-06-19,32.770711,85.430509,36870.0,135738.0,0.003305,0.002417,2748885.0,12150610.0,839.0,322.0,0.02481,0.022081,-0.002729
2019-06-20,32.800199,85.901849,64990.0,181402.0,0.000899,0.005502,2252447.0,12150610.0,687.0,262.0,0.02488,0.026878,0.001998


In [54]:
df_positions = pd.DataFrame(strategy.closed_positions)
df_positions.open_date = pd.to_datetime(df_positions.open_date)
df_positions[df_positions.open_date.isin(pd.date_range(*dates))]

Unnamed: 0,position_type,open_date,security,shares,open_price,open_transact_cost,close_price,close_transact_cost,closed,close_date,transact_cost_per_share
32,PositionType.LONG,2019-06-13,VOX,329.0,83.662981,3.29,85.901849,3.29,True,2019-06-20,0.01
33,PositionType.SHORT,2019-06-13,FCOM,857.0,32.092493,8.57,32.800199,8.57,True,2019-06-20,0.01
34,PositionType.LONG,2019-06-20,FCOM,687.0,32.800199,6.87,32.954564,6.87,True,2019-06-24,0.01
35,PositionType.SHORT,2019-06-20,VOX,262.0,85.901849,2.62,85.912382,2.62,True,2019-06-24,0.01


## Metrics Derived from Return Series

### Load Fama-French Factor Returns
* [Description of Fama/French Factors](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_factors.html)


In [55]:
df_ff = pd.read_csv("F-F_Research_Data_5_Factors_2x3_daily_subset.csv", header=0, index_col=0)
df_ff.index = pd.to_datetime(df_ff.index, infer_datetime_format=False, format="%Y%m%d")
df_ff.index.name = "date"
df_ff.tail()

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,RF
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-22,-1.12,0.68,3.14,1.66,0.9,0.0
2021-02-23,-0.15,-1.05,0.9,1.08,-0.19,0.0
2021-02-24,1.15,1.48,1.34,-0.29,0.32,0.0
2021-02-25,-2.73,-0.9,0.87,1.0,0.47,0.0
2021-02-26,-0.28,0.38,-1.56,-0.06,-0.38,0.0


* Is this the right ways to calculate risk free rate?

In [56]:
df_stats = pd.DataFrame(strategy.stats).set_index("date")
df_stats["rf"] = df_ff.RF / 100
df_stats

Unnamed: 0_level_0,realized_profit,unrealized_profit,total_profit,tick_profit,total_return,tick_return,rf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-16,-11.040000,-11.040000,-22.080000,-22.080000,-0.000157,-0.000157,0.0001
2019-01-17,-11.040000,-28.254285,-39.294285,-17.214285,-0.000279,-0.000122,0.0001
2019-01-18,-11.040000,17.396207,6.356207,45.650492,0.000045,0.000324,0.0001
2019-01-22,-11.040000,-63.969409,-75.009409,-81.365616,-0.000532,-0.000577,0.0001
2019-01-23,-11.040000,26.125869,15.085869,90.095278,0.000107,0.000639,0.0001
...,...,...,...,...,...,...,...
2020-12-24,1409.372310,-9.650000,1399.722310,-19.300000,0.009883,-0.000137,0.0000
2020-12-28,1409.372310,25.110132,1434.482442,34.760132,0.010127,0.000247,0.0000
2020-12-29,1409.372310,27.756011,1437.128321,2.645879,0.010145,0.000019,0.0000
2020-12-30,1409.372310,18.863275,1428.235585,-8.892736,0.010083,-0.000063,0.0000


### Original Share Ratio
* This seems like an artificially high Sharpe Ratio - variance is low because of large number of days with no trading activity.
* Uses the Fama-French risk free rate

In [57]:
r_rf = df_stats.tick_return - df_stats.rf
mean = r_rf.mean()
var = df_stats.tick_return.var()
sharpe_orig =  mean / var
print('mean:', f"{mean:0.5f}")
print('var:', f"{var:0.5f}")
print('sharpe_orig:', f"{sharpe_orig:0.5f}")

mean: -0.00003
var: 0.00000
sharpe_orig: -432.29206


### Information Ratio
* Average daily returns on a buy and hold basis of the underlying securities as proxies for benchmarks.


In [58]:
rb = ((strategy.df_ticks["adj_return"] * strategy.df_ticks["volume"]).sum(axis=1)
        / strategy.df_ticks["volume"].sum(axis=1))
r_rb = df_stats.tick_return - rb
mean = r_rb.mean()
var = r_rb.var()
ratio =  mean / var
print('mean:', f"{mean:0.5f}")
print('var:', f"{var:0.5f}")
print('ratio:', f"{ratio:0.5f}")

mean: -0.00083
var: 0.00026
ratio: -3.16287


### Sortino Ratio
* Average daily returns on a buy and hold basis of the underlying securities as proxies for benchmarks.


In [59]:
mean = r_rb.mean()
var = r_rb[r_rb < 0].var()
ratio =  mean / var
print('mean:', f"{mean:0.5f}")
print('var:', f"{var:0.5f}")
print('ratio:', f"{ratio:0.5f}")

mean: -0.00083
var: 0.00011
ratio: -7.48586


### Beta and Downside Beta
* Uses Fama-French for market return and risk free rate
* Negative beta does not apparently occurr very often - not certain how to interpret or if this indicates an error in the calculation
* Downside beta has slightly higher magnitude the regular beta


In [60]:
m_rf = df_ff["Mkt-RF"][df_ff.index.isin(r_rf.index)] / 100
beta = r_rf.corr(m_rf)
beta

-0.23701789167801107

In [61]:
m_rf_downside = m_rf[m_rf < 0]
r_rf_downside = r_rf[r_rf.index.isin(m_rf_downside.index)]
downside_beta = r_rf_downside.corr(m_rf_downside)
downside_beta

-0.2786207159397842

* The magnitude of the market's returns is much greater than the strategy's returns.
* It does appear, also referencing back to the strategy plot above that there is negative correlation, i.e., the strategy made money during the period of market volatility in which the market returns were negative.
* Negative correlation is also evident in several other periods where the market had high volatility and negative returns

In [72]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=m_rf_downside.index, y=m_rf_downside, name="Downside Mkt-RF"), secondary_y=False)
fig.add_trace(go.Scatter(x=m_rf_downside.index, y=r_rf_downside, name="Returns"), secondary_y=True)
fig.update_layout(
    title_text="Analysis of Downside Beta"
)
fig.update_xaxes(title_text="date")
fig.update_yaxes(title_text="market", secondary_y=False)
fig.update_yaxes(title_text="strategy", secondary_y=True)
fig.update_layout(template="none")