# Futures Spreads

## Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local futures_spreads package) are re-imported before each cell
# executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
from collections import namedtuple
import hashlib
from functools import partial
import os
import sqlite3
from typing import Callable, Tuple

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import scipy as sp
from scipy.stats import norm, kurtosis
import quandl
from tqdm.notebook import trange, tqdm

from futures_spreads import utils

pd.options.plotting.backend = "plotly"

## Fetch Data from Quandl

[OptionWorks Futures Options](https://www.quandl.com/data/OWF-OptionWorks-Futures-Options/documentation)

Security   | Specifications
-----------|-------------------------------------------------------------------------------------------------------
CBT_FV_FV |[Five-Year T-Note Futures - Contract Specs](https://www.cmegroup.com/trading/interest-rates/us-treasury/5-year-us-treasury-note_contractSpecs_futures.html)
CBT_TY_TY  | [10-Year T-Note Futures - Contract Specs](https://www.cmegroup.com/trading/interest-rates/us-treasury/10-year-us-treasury-note_contractSpecs_futures.html)
ICE_B_B    | [Brent Crude Futures](https://www.theice.com/products/219/Brent-Crude-Futures)
ICE_G_G    | [Low Sulphur Gasoil Futures](https://www.theice.com/products/34361119/Low-Sulphur-Gasoil-Futures)

In [28]:
# Date window requested for every dataset.
start_date = '2018-12-03'
end_date = '2020-08-31'

# Expiration labels: each month letter in `months` combined with each year,
# ordered by year first (e.g. "H2019", "M2019", ..., "Z2020").
months = "HMUZ"
years = ["2019", "2020"]
exps = [f"{month}{year}" for year in years for month in months]
column_index = [1, 15, 16]

# (security code, price multiplier) pairs.
securities = [
    ("CBT_FV_FV", 1),
    ("CBT_TY_TY", 1),
    ("ICE_B_B", 1),
    ("ICE_G_G", 1/7.45),
]

# One dataset code per (security, expiration) combination.
sec_list = [
    utils.get_security_code(code, expiration=expiration)
    for code, _ in securities
    for expiration in exps
]

query_params = dict(dataset=sec_list, start_date=start_date, end_date=end_date)

In [32]:
# Retrieve the datasets described by `query_params` through the project helper.
# NOTE(review): the saved output ("Loading ...csv from disk.") suggests the
# helper caches results in a CSV file on disk — confirm in utils.fetch_data.
data = utils.fetch_data(query_params)

Loading futures_spreads/data/770ea2fb11f83d0e51a45cb7eaf54370.csv from disk.


## Prepare Data

In [37]:
# Expand every column of `data` whose name does not end in "_x"/"_y"
# (presumably leftover merge suffixes — confirm) and stack the pieces into one
# frame. The pieces are collected first and concatenated once, instead of
# re-copying a growing DataFrame on every loop iteration.
# NOTE(review): the saved output of this cell shows an IndexError raised right
# after ['Date'] was printed — presumably utils.expand_series cannot handle the
# 'Date' column; confirm and exclude it here if so.
expanded = [
    utils.expand_series(data[c])
    for c in data.columns
    if c[-2:] not in ("_x", "_y")
]
# pd.concat raises on an empty list; fall back to the empty frame the
# original loop would have produced.
df_all = pd.concat(expanded) if expanded else pd.DataFrame()

['Date']


IndexError: list index out of range

In [55]:
def set_cols(c):
    """Flatten a two-part column label into a single name.

    Returns the second element of ``c`` when it is truthy, otherwise the
    first element.
    """
    outer, inner = c[0], c[1]
    if inner:
        return inner
    return outer

# Group on every column except the last, keep the max per group, pivot the
# "series" index level into columns, and restore the keys as regular columns.
df_g = (
    df_all
    .groupby(df_all.columns[:-1].to_list())
    .max()
    .unstack("series")
    .reset_index()
)
# Flatten the two-level column labels produced by the unstack.
df_g.columns = [set_cols(col) for col in df_g.columns]
# Attach each security's multiplier from the (code, multiplier) pairs.
df_g["multiplier"] = df_g["security"].map(dict(securities))
df_g

Unnamed: 0,date,data_feed,security,expiration,model,dte,dtt,future,multiplier
0,2018-12-03,OWF,CBT_FV_FV,H2019,IVM,81.21,116.0,112.976562,1.000000
1,2018-12-03,OWF,CBT_FV_FV,H2020,IVM,,,,1.000000
2,2018-12-03,OWF,CBT_FV_FV,M2019,IVM,172.21,207.0,112.976562,1.000000
3,2018-12-03,OWF,CBT_FV_FV,M2020,IVM,,,,1.000000
4,2018-12-03,OWF,CBT_FV_FV,U2019,IVM,,,,1.000000
...,...,...,...,...,...,...,...,...,...
14491,2020-08-31,OWF,ICE_G_G,M2020,IVM,,,,0.134228
14492,2020-08-31,OWF,ICE_G_G,U2019,IVM,,,,0.134228
14493,2020-08-31,OWF,ICE_G_G,U2020,IVM,2.85,10.0,362.000000,0.134228
14494,2020-08-31,OWF,ICE_G_G,Z2019,IVM,,,,0.134228


## Load Data

In [12]:
# Open an in-memory SQLite database (":memory:"); its contents exist only for
# the lifetime of this connection.
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()

In [13]:
# Rebuild the SQLite table from df_g, writing it in chunks so the progress bar
# advances once per chunk.
table_name = "futures_spreads"
# DROP TABLE produces no result rows, so there is nothing to fetch afterwards.
cursor.execute(f"DROP TABLE IF EXISTS {table_name};")

chunk_size = 20000
total = len(df_g)
# Ceiling division: the previous `total // chunk_size + 1` scheduled a spurious
# empty chunk whenever `total` was an exact multiple of `chunk_size`.
# At least one pass is kept so an empty frame is still written as before.
n_chunks = max(1, -(-total // chunk_size))
for i in trange(n_chunks):
    chunk = df_g.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk.to_sql(table_name, conn, method='multi', if_exists='append', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

## Query Second Month Contracts

In [56]:
# For every (security, date), keep the row ranked first by dte among contracts
# with dtt > 30, scale the futures price by the multiplier, then pivot so each
# security becomes a column under expiration / dtt / adj_future.
df_second = (
    pd.read_sql("""
    SELECT date, security, expiration, dtt, future * multiplier as adj_future FROM (
        SELECT *, rank() OVER w rank
        FROM futures_spreads
        WHERE dtt > 30
        WINDOW w AS (PARTITION BY security, date ORDER BY dte)
    ) WHERE rank = 1
""", conn)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index(["date", "security"])
    .unstack("security")
)
df_second.tail(10)

Unnamed: 0_level_0,expiration,expiration,expiration,expiration,dtt,dtt,dtt,dtt,adj_future,adj_future,adj_future,adj_future
security,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2020-08-18,Z2020,U2020,Z2020,Z2020,135.0,34.0,73.0,114.0,125.929688,139.390625,46.49,52.718121
2020-08-19,U2020,U2020,Z2020,Z2020,42.0,33.0,72.0,113.0,125.835938,139.375,46.36,52.348993
2020-08-20,Z2020,U2020,Z2020,Z2020,133.0,32.0,71.0,112.0,126.0,139.5625,45.97,51.744966
2020-08-21,Z2020,Z2020,Z2020,Z2020,132.0,122.0,70.0,111.0,125.96875,139.4375,45.39,50.369128
2020-08-24,Z2020,Z2020,Z2020,Z2020,129.0,119.0,67.0,108.0,125.929688,139.390625,46.07,51.711409
2020-08-25,Z2020,Z2020,Z2020,Z2020,128.0,118.0,66.0,107.0,125.882812,139.171875,46.66,52.516779
2020-08-26,Z2020,Z2020,Z2020,Z2020,127.0,117.0,65.0,106.0,125.867188,139.171875,46.58,51.744966
2020-08-27,Z2020,Z2020,Z2020,Z2020,126.0,116.0,64.0,105.0,125.78125,138.828125,46.01,50.771812
2020-08-28,Z2020,Z2020,Z2020,Z2020,125.0,115.0,63.0,104.0,125.992188,139.09375,46.25,51.241611
2020-08-31,Z2020,Z2020,Z2020,Z2020,122.0,112.0,60.0,101.0,126.03125,139.25,45.66,50.738255


### Test Pairs

In [23]:
# Reference checks: (first security, second security, date, expected value of
# second minus first adjusted futures price on that date).
test_pairs = [
    ("ICE_B_B", "ICE_G_G", "2019-06-27", 14.799799),
    ("ICE_B_B", "ICE_G_G", "2019-08-13", 16.264966),
    ("CBT_FV_FV", "CBT_TY_TY", "2019-08-16", 11.421875),
    ("CBT_FV_FV", "CBT_TY_TY", "2019-08-23", 11.710938),
]

for s1, s2, date, value in test_pairs:
    first_leg = df_second.loc[date, ("adj_future", s1)]
    second_leg = df_second.loc[date, ("adj_future", s2)]
    calc = second_leg - first_leg
    # Every reference value must be reproduced to within 1e-4.
    assert abs(calc - value) < 1e-4, "Back to the tower Quasimodo..."

print("Money!")

Money!


## Analysis

In [42]:
# Security pairs for which a spread series is built.
pairs = [
    ("CBT_FV_FV", "CBT_TY_TY"),
    ("ICE_B_B", "ICE_G_G"),
]

# NOTE(review): this cell originally called `cuti.get_spread`, but no name
# `cuti` is imported in the notebook's import cell, so it fails with NameError
# on a fresh kernel. The only project helper module imported is
# `futures_spreads.utils`; assuming get_spread lives there — confirm.
df_spreads = pd.concat([utils.get_spread(pair, df_second) for pair in pairs], axis=1)

In [41]:
# Summary statistics of the spread levels, their daily differences and their
# daily percentage changes, placed side by side under a (label, security)
# column index.
stat_frames = [
    df_spreads.describe(),
    df_spreads.diff().describe(),
    df_spreads.pct_change().describe(),
]
df_stats = pd.concat(stat_frames, axis=1)
# Labels vary slowest, matching the order in which the frames were concatenated.
df_stats.columns = pd.MultiIndex.from_product(
    [["dollars", "diff", "pct_change"], df_spreads.columns],
    names=('label', 'security'),
)
df_stats

label,dollars,dollars,diff,diff,pct_change,pct_change
security,CBT:TY-FV,ICE:G-B,CBT:TY-FV,ICE:G-B,CBT:TY-FV,ICE:G-B
count,452.0,450.0,450.0,446.0,452.0,452.0
mean,10.559372,12.921223,0.014358,-0.005517,0.001708,0.007518
std,2.195973,4.710429,0.209355,0.990879,0.018756,0.146384
min,6.617188,1.203826,-1.226562,-6.636174,-0.096922,-0.609314
25%,8.388672,9.611695,-0.085938,-0.577265,-0.008617,-0.043258
50%,10.652344,14.836208,0.015625,-0.037752,0.001459,-0.002759
75%,12.867188,16.181577,0.125,0.612987,0.011954,0.047999
max,14.1875,20.493221,1.0625,3.225034,0.087855,1.053632



#### Overview

This begins the analysis and discussion of two futures spreads, with the goal of determining whether either has the potential to be high quality. The spreads are based on the second-month prices as calculated from the main quarterly contracts. The first spread is the 10-Year T-Note over the 5-Year T-Note (referred to as CBT:TY-FV throughout) and the second is Low Sulphur Gasoil over Brent Crude (ICE:G-B). The basis of d



As can be seen in the first chart below, we can't just look at the entire period because it includes the highly turbulent period surrounding the onset of the coronavirus pandemic. However, if we look at the periods before and after the pandemic period, we can determine whether either of those periods are 


Looking at the first chart below, we see that CBT:TY-FV generally has an upward trend throughout the period, while ICE:G-B was on a similar upward trend through the beginning of 2019Q4 and then dropped over 60% over the next four months. ICE:G-B appears to have leveled off and been more stable since 2020Q2.

Obviously the onset of the pandemic and resulting sharp drop in expected economic activity is a major outlier event that had a big impact on the dynamics of the spreads. In analyzing their historical performance, especially if our goal is to form a view on some range of potential future outcomes for the spreads, we need to consider how to treat this turbulent period in our analysis. A starting point is to be clear about our expectations regarding the likelihood of continuing to experience highly disruptive events with the potential to result in unacceptable losses. Going on the ideas presented in lecture, if the main goal of a spread trading strategy is to profit from relatively modest fluctuations about some mean, then if our expectations of continuing to experience turbulence of the magnitude we experienced at the beginning of 2020 are high enough, we may choose another strategy altogether.

However, if we expect that the current state of the world is somewhat more normal and we don't foresee any major shocks on the horizon, then it seems that we should exclude the turbulent period from our analysis. Perhaps, if we are more leery of big shocks, we may take that into account by setting more conservative stop-loss limits and prioritizing the ability to unwind our positions quickly.

Also note that even though there appears to be much higher volatility with a much bigger change in the level of the spread for ICE:G-B, the same applies for CBT:TY-FV. The same macro conditions applied and you can see that it experienced a period of abnormal activity during March of 2020. If you look also at the charts of the underlying securities below, you can see that even though the spread remained relatively constant, both of the underlying securities experienced declines in excess of 50% from January to May of last year.

I think the fundamental questions are:
* Is either of these spreads attractive for the purpose of developing a mean reversion strategy?
* If we remove the outlier period and analyze the data from the remainder of the entire period, is either of these spreads attractive?
* If we analyze either the period before or the period after the outlier period, is either of the spreads attractive?
* If the spread is not attractive in either the before or after period alone, then we can safely conclude that it is not attractive, since (i) the aggregation of two unattractive periods is unlikely to result in the combined period being attractive and (ii) given the highly unusual and episodic nature of the events that resulted in the turbulent period, even if it was attractive, we wouldn't expect it to occur again.


I would approach the analysis the following way:
* Start off by analyzing the period up through September of 2019
* Then analyze the period from June of 2020 to the end of the period
* If both of those periods exhibit similar characteristics, then that could be a basis for either combining them or relying on the more recent period for developing a strategy.
* If the two periods have different characteristics, then that is a tougher call. On the one hand, the more recent period may be more relevant if the conclusion is that this is the new normal. On the other hand, though, we may conclude that the period we are in is still somewhat of an aberration and the historical period is a better representation of what to expect in the future. (To the extent that we did have specific expectations for the future, we may want to consider analyzing other historical periods with similar dynamics.)

#### CBT:TY-FV
This is the spread of the 2nd month futures contract prices (based on the main quarterly contracts) for the 10-Year T-Note and 5-Year T-Note. CBT:TY-FV exhibits an upward trend over the period, rising from \$6.6172 at the beginning of the period to \$13.1016 at the end of the period. There does appear to be a spike leading up to 2020-03-09. A similar spike can be observed around that time in the ICE:G-B spread. The mean daily dollar change was \$0.0144 and the mean daily percentage change was 0.0017. The daily dollar standard deviation was \$0.2094 and the standard deviation in daily percentage change terms was 0.0188.

#### ICE:G-B


In [47]:
# Plot both spread series through the pandas plotting backend and prefix the
# y-axis ticks with a dollar sign.
fig = df_spreads.plot(
    title="Futures Spreads - 2nd Month",
    labels={"index": "time", "value": "spread", "variable": "spreads"},
    template="none",
    width=1000,
    height=500,
)
fig.update_yaxes(tickprefix="$")
fig.show()

* Side by side underlying asset with spreads
* Side by side rolling averages versus original spread
* Median and standard deviation of difference to N-day rolling average
* Scatter plot of daily returns with normal distribution and kurtosis stats

In [48]:
# NOTE(review): originally `cuti.make_spread_charts`; `cuti` is not imported in
# the notebook's import cell (NameError on a fresh kernel). Assuming the helper
# lives in the imported `futures_spreads.utils` module — confirm.
utils.make_spread_charts(
    pairs,
    df_second,
    title_text="Futures Spreads - 2nd Month",
    fig_size=dict(height=800, width=1200),
).show()

In [50]:
# NOTE(review): originally `cuti.make_tail_charts`; `cuti` is not imported in
# the notebook's import cell (NameError on a fresh kernel). Assuming the helper
# lives in the imported `futures_spreads.utils` module — confirm.
utils.make_tail_charts(
    pairs, df_second,
    title_text="2nd Month Futures Spreads - Distribution of Daily Returns",
    fig_size=dict(width=1200, height=500)
).show()

What would be interesting here would be to look at the distributions with the outlier event taken out.

In [90]:
# Kurtosis (scipy default settings) of the CBT:TY-FV daily percentage changes
# up to the end of 2019. The column label is spelled out here because the
# variable previously used for it (`p1`) is only assigned in a later cell,
# which made this cell fail with NameError when the notebook is run top to
# bottom.
kurtosis(df_spreads.loc[:"2019-12-31", "CBT:TY-FV"].pct_change().dropna())

0.9871648920258291

In [88]:
p1 = "CBT:TY-FV"
p2 = "ICE:G-B"
demo = df_spreads[p1].pct_change()

# Number of daily returns more than three sample standard deviations from
# zero. The original evaluated this expression and threw the result away, and
# reused the name `std` for both the sample deviation and the fitted one.
sample_std = demo.std()
n_beyond_3_std = (demo.abs() > 3 * sample_std).sum()

# Fit a normal distribution to the daily returns; `std` is the fitted scale.
mu, std = norm.fit(demo.dropna())
mu, std

(0.0017075365145233729, 0.01873568503043966)

In [79]:
# NOTE(review): originally `cuti.get_spread`; `cuti` is not imported in the
# notebook's import cell (NameError on a fresh kernel). Assuming the helper
# lives in the imported `futures_spreads.utils` module — confirm.
spread = utils.get_spread(pairs[0], df=df_second)

# Daily percentage changes, computed once and reused below.
pct = spread.pct_change()

# Histogram of the returns over 100 equal-width bins, in bin order.
returns = pd.cut(pct, 100).value_counts().sort_index()

# Normal density evaluated at each bin midpoint, using the sample mean and
# standard deviation of the returns.
bin_mids = returns.index.map(lambda interval: interval.mid).to_numpy()
norm_dist = norm.pdf(bin_mids, loc=pct.mean(), scale=pct.std())

# Sanity check: the densities rescaled by their own sum add up to 1.
(norm_dist / norm_dist.sum()).sum()

1.0

In [57]:
# Same distribution charts restricted to data through 2019-12-31.
# NOTE(review): originally `cuti.make_tail_charts`; `cuti` is not imported in
# the notebook's import cell (NameError on a fresh kernel). Assuming the helper
# lives in the imported `futures_spreads.utils` module — confirm.
fig = utils.make_tail_charts(
    pairs,
    df_second.loc[:"2019-12-31"].copy(),
    title_text="2nd Month Futures Spreads - Distribution of Daily Returns",
)
fig.show()