# Futures Spreads

## Utility Functions

In [120]:
# <include-futures_spreads/utils.py>

## Imports

In [121]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [122]:
# <imports>
import sqlite3

import numpy as np
import pandas as pd
from tqdm.notebook import trange
from futures_spreads import utils

# Use plotly (rather than the pandas default) as the backend for DataFrame/Series .plot calls.
pd.options.plotting.backend = "plotly"

## Fetching Data from Quandl

[OptionWorks Futures Options](https://www.quandl.com/data/OWF-OptionWorks-Futures-Options/documentation)

Security   | Specifications
-----------|-------------------------------------------------------------------------------------------------------
CBT_FV_FV  | [Five-Year T-Note Futures - Contract Specs](https://www.cmegroup.com/trading/interest-rates/us-treasury/5-year-us-treasury-note_contractSpecs_futures.html)
CBT_TY_TY  | [10-Year T-Note Futures - Contract Specs](https://www.cmegroup.com/trading/interest-rates/us-treasury/10-year-us-treasury-note_contractSpecs_futures.html)
ICE_B_B    | [Brent Crude Futures](https://www.theice.com/products/219/Brent-Crude-Futures)
ICE_G_G    | [Low Sulphur Gasoil Futures](https://www.theice.com/products/34361119/Low-Sulphur-Gasoil-Futures)

In [123]:
# Date window requested from the data provider.
start_date, end_date = '2018-12-03', '2020-08-31'

# Contract month codes crossed with the sample years, year-major:
# H2019, M2019, U2019, Z2019, H2020, ...
months = "HMUZ"
years = ["2019", "2020"]
exps = [f"{month}{year}" for year in years for month in months]
column_index = [1, 15, 16]

# (security code, price multiplier) pairs. The multiplier is later applied to
# the futures price so both legs of a spread are expressed in comparable units.
# NOTE(review): 1/7.45 presumably converts gasoil from per-tonne to per-barrel
# pricing -- confirm against the contract specs.
securities = [
    ("CBT_FV_FV", 1),
    ("CBT_TY_TY", 1),
    ("ICE_B_B", 1),
    ("ICE_G_G", 1 / 7.45),
]

# One provider code per (security, expiration) combination, security-major.
sec_list = [
    utils.get_security_code(code, expiration=exp)
    for code, _ in securities
    for exp in exps
]

query_params = dict(dataset=sec_list, start_date=start_date, end_date=end_date)

In [124]:
# Retrieve every requested series through the project helper. In the recorded
# run the helper reported loading a CSV from disk rather than downloading.
data = utils.fetch_data(query_params, data_dir=".")

Loading futures_spreads/data/770ea2fb11f83d0e51a45cb7eaf54370.csv from disk.


## Data Preparation

Flatten the data.

In [125]:
# Expand every column (skipping the "_x"/"_y" suffixed ones) into long format
# and concatenate once. Collecting the pieces first avoids re-copying the
# accumulated frame on every iteration and avoids seeding the concat with an
# empty DataFrame.
expanded_frames = [
    utils.expand_series(data[c])
    for c in data.columns
    if c[-2:] not in ["_x", "_y"]
]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
df_all = pd.concat(expanded_frames) if expanded_frames else pd.DataFrame()
df_all.head()

Unnamed: 0,date,data_feed,security,expiration,model,series,value
0,2018-12-03,OWF,CBT_FV_FV,H2019,IVM,future,112.976562
1,2018-12-04,OWF,CBT_FV_FV,H2019,IVM,future,113.15625
2,2018-12-05,OWF,CBT_FV_FV,H2019,IVM,future,113.15625
3,2018-12-06,OWF,CBT_FV_FV,H2019,IVM,future,113.421875
4,2018-12-07,OWF,CBT_FV_FV,H2019,IVM,future,113.695312


Use groupby to create rows that include dte, dtt and future.

In [126]:
def set_cols(c):
    """Collapse a two-level column label into a single name.

    Returns the second element of ``c`` when it is truthy, otherwise the
    first element.
    """
    return c[1] or c[0]

# Group on every column except the last one, then pivot the "series" level
# into columns so each row carries one value per series.
group_keys = df_all.columns[:-1].to_list()
df_g = df_all.groupby(group_keys).max().unstack("series").reset_index()

# Flatten the resulting two-level column labels into plain names.
df_g.columns = [set_cols(label) for label in df_g.columns]

# Attach the per-security price multiplier defined in `securities`.
df_g["multiplier"] = df_g["security"].map(dict(securities))
df_g.dropna().tail()

Unnamed: 0,date,data_feed,security,expiration,model,dte,dtt,future,multiplier
14471,2020-08-31,OWF,CBT_FV_FV,Z2020,IVM,81.21,122.0,126.03125,1.0
14479,2020-08-31,OWF,CBT_TY_TY,Z2020,IVM,81.21,112.0,139.25,1.0
14487,2020-08-31,OWF,ICE_B_B,Z2020,IVM,56.85,60.0,45.66,1.0
14493,2020-08-31,OWF,ICE_G_G,U2020,IVM,2.85,10.0,362.0,0.134228
14495,2020-08-31,OWF,ICE_G_G,Z2020,IVM,93.85,101.0,378.0,0.134228


## Load Data

In [127]:
# In-memory SQLite database: contents exist only for the lifetime of `conn`.
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()

In [128]:
table_name = "futures_spreads"
# Start from a clean slate so re-running the cell does not append duplicates.
cursor.execute(f"DROP TABLE IF EXISTS {table_name};")

chunk_size = 20000
total = len(df_g)
# Ceiling division: an exact multiple of chunk_size no longer schedules a
# trailing empty chunk. At least one chunk is always written so the table is
# created even when df_g is empty.
n_chunks = max(1, -(-total // chunk_size))
for i in trange(n_chunks):
    chunk = df_g.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk.to_sql(table_name, conn, method='multi', if_exists='append', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

## Select Second Month Contracts

In [129]:
# For each (security, date), keep the contract with the smallest dte among
# those with dtt > 30, and scale its futures price by the security multiplier.
second_month_sql = """
    SELECT date, security, expiration, dtt, future * multiplier as adj_future FROM (
        SELECT *, rank() OVER w rank
        FROM futures_spreads
        WHERE dtt > 30
        WINDOW w AS (PARTITION BY security, date ORDER BY dte)
    ) WHERE rank = 1
"""
df_second = (
    pd.read_sql(second_month_sql, conn)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index(["date", "security"])
    # One column per (field, security); rows indexed by date.
    .unstack("security")
)
df_second.tail(10)

Unnamed: 0_level_0,expiration,expiration,expiration,expiration,dtt,dtt,dtt,dtt,adj_future,adj_future,adj_future,adj_future
security,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G,CBT_FV_FV,CBT_TY_TY,ICE_B_B,ICE_G_G
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2020-08-18,Z2020,U2020,Z2020,Z2020,135.0,34.0,73.0,114.0,125.929688,139.390625,46.49,52.718121
2020-08-19,U2020,U2020,Z2020,Z2020,42.0,33.0,72.0,113.0,125.835938,139.375,46.36,52.348993
2020-08-20,Z2020,U2020,Z2020,Z2020,133.0,32.0,71.0,112.0,126.0,139.5625,45.97,51.744966
2020-08-21,Z2020,Z2020,Z2020,Z2020,132.0,122.0,70.0,111.0,125.96875,139.4375,45.39,50.369128
2020-08-24,Z2020,Z2020,Z2020,Z2020,129.0,119.0,67.0,108.0,125.929688,139.390625,46.07,51.711409
2020-08-25,Z2020,Z2020,Z2020,Z2020,128.0,118.0,66.0,107.0,125.882812,139.171875,46.66,52.516779
2020-08-26,Z2020,Z2020,Z2020,Z2020,127.0,117.0,65.0,106.0,125.867188,139.171875,46.58,51.744966
2020-08-27,Z2020,Z2020,Z2020,Z2020,126.0,116.0,64.0,105.0,125.78125,138.828125,46.01,50.771812
2020-08-28,Z2020,Z2020,Z2020,Z2020,125.0,115.0,63.0,104.0,125.992188,139.09375,46.25,51.241611
2020-08-31,Z2020,Z2020,Z2020,Z2020,122.0,112.0,60.0,101.0,126.03125,139.25,45.66,50.738255


### Test Pairs

In [130]:
# Known-good spread values: (first leg, second leg, date, expected second - first).
test_pairs = [
    ("ICE_B_B", "ICE_G_G", "2019-06-27", 14.799799),
    ("ICE_B_B", "ICE_G_G", "2019-08-13", 16.264966),
    ("CBT_FV_FV", "CBT_TY_TY", "2019-08-16", 11.421875),
    ("CBT_FV_FV", "CBT_TY_TY", "2019-08-23", 11.710938),
]

for s1, s2, date, value in test_pairs:
    first_leg = df_second.loc[date, ("adj_future", s1)]
    second_leg = df_second.loc[date, ("adj_future", s2)]
    calc = second_leg - first_leg
    # Fail loudly if the computed spread drifts from the reference value.
    assert abs(calc - value) < 1e-4, "Back to the tower Quasimodo..."

print("All Good!")

All Good!


## Analysis

The two futures spreads analyzed are:
* [10-Year T-Note Futures](https://www.cmegroup.com/trading/interest-rates/us-treasury/10-year-us-treasury-note_contractSpecs_futures.html) over the [Five-Year T-Note Futures](https://www.cmegroup.com/trading/interest-rates/us-treasury/5-year-us-treasury-note_contractSpecs_futures.html) (CBT:TY-FV throughout)
* [Low Sulphur Gasoil Futures](https://www.theice.com/products/34361119/Low-Sulphur-Gasoil-Futures) over [Brent Crude Futures](https://www.theice.com/products/219/Brent-Crude-Futures) (ICE:G-B throughout)

The prices are calculated using second month contracts (earliest expiration greater than 30 days) using quarterly contract expirations for the period from 2018-12-03 to 2020-08-31. Discussion and analysis of the following exhibits is included:
* Spread charts
* Summary statistics
* Histograms of daily returns compared to normal distributions
* Q-Q plots
* Rolling kurtosis charts
* Statistics of rolling average differences

In [131]:
# Each pair is (first leg, second leg) of one spread.
pairs = [
    ("CBT_FV_FV", "CBT_TY_TY"),
    ("ICE_B_B", "ICE_G_G"),
]

# One spread per pair, placed side by side as columns.
spread_frames = []
for pair in pairs:
    spread_frames.append(utils.get_spread(pair, df_second))
df_spreads = pd.concat(spread_frames, axis=1)

### Spread Charts
A significant feature of both sets of charts is the relatively large change in performance that occurs beginning in the first quarter of 2020, presumably as a result of the pandemic. The effect is seen more clearly in the ICE:G-B spread than it is in the CBT:TY-FV spread, but is evident in the CBT:TY-FV spread nonetheless, and clearly evident in all of the underlying securities. If the objective of this analysis was to form a basis for a reversion to the mean trading strategy and there was no expectation of there being similar shocks to the system during our investment horizon, it may be appropriate to exclude this period from the analysis.

In [132]:
# Chart both spreads from the second-month data.
spread_chart_size = {"width": 1000, "height": 800}
utils.make_spread_charts(
    pairs,
    df_second,
    title_text="Futures Spreads - Second Month",
    fig_size=spread_chart_size,
)

### Summary Statistics
Across the entire period, CBT:TY-FV had a mean daily return of 0.001511 with a standard deviation of 0.018785. ICE:G-B was much more volatile, with a standard deviation of 0.139732 and a mean of -0.000871. That additional volatility can also be seen in the relative minimum and maximum daily returns. Whereas CBT:TY-FV had a min and max of -0.101946 and 0.084208, respectively, ICE:G-B had a min and max of -0.939851 and 0.719610, respectively. The range of ICE:G-B is 1.659461, which is 8.9 times greater than the range of 0.186154 for CBT:TY-FV. The range between the 25th percentile and the 75th percentile for ICE:G-B is still 4.4 times that of CBT:TY-FV. One last point to note is that ICE:G-B had only a slightly negative mean, even though it declined significantly over the period. This indicates there were a small number of observations with relatively large negative returns. For example, as can be seen in the chart above, ICE:G-B declined 51.0%, from $13.009530 on 2020-03-30 to $6.373356 on 2020-03-31.


In [133]:
# Summary statistics of the spread level, its daily change, and its daily
# log return, placed side by side.
stat_frames = [
    df_spreads.describe(),
    df_spreads.diff().describe(),
    (df_spreads / df_spreads.shift(1)).apply(np.log).describe(),
]
df_stats = pd.concat(stat_frames, axis=1)

# Label the columns as (statistic group, spread), group-major to match the
# concatenation order above.
columns = ["dollars", "diff", "return"]
df_stats.columns = pd.MultiIndex.from_product(
    [columns, df_spreads.columns], names=('label', 'security')
)
df_stats

label,dollars,dollars,diff,diff,return,return
security,CBT:TY-FV,ICE:G-B,CBT:TY-FV,ICE:G-B,CBT:TY-FV,ICE:G-B
count,452.0,450.0,450.0,446.0,450.0,446.0
mean,10.559372,12.921223,0.014358,-0.005517,0.001511,-0.000871
std,2.195973,4.710429,0.209355,0.990879,0.018785,0.139732
min,6.617188,1.203826,-1.226562,-6.636174,-0.101946,-0.939851
25%,8.388672,9.611695,-0.085938,-0.577265,-0.008696,-0.044117
50%,10.652344,14.836208,0.015625,-0.037752,0.001458,-0.002937
75%,12.867188,16.181577,0.125,0.612987,0.01186,0.047125
max,14.1875,20.493221,1.0625,3.225034,0.084208,0.71961


### Distributions
These charts were constructed by creating a histogram of daily returns and then normalizing them to 100%. The normal distributions shown for comparison have the same parameters as the spread distributions and are similarly normalized to 100%. Summary statistics are also provided for reference, with the addition of skewness and kurtosis (normal = 0).

#### Entire Period
The distribution of daily returns for CBT:TY-FV does not exhibit a high degree of skewness and is slightly leptokurtic, with excess kurtosis of 5.0569. The normal distribution appears to fit reasonably well. The distribution of daily returns for ICE:G-B is much more peaked than that of CBT:TY-FV, with excess kurtosis of 13.0278.

In [134]:
# Distribution charts for both spreads over the full sample, explicitly
# requesting log returns (return_type='log').
utils.make_tail_charts(
    pairs, df_second,
    title_text="Futures Spreads - Second Month - Distribution of Daily Log Returns",
    return_type='log',
    moments_xanchors=("left", "left")
)

#### Excluding Extraordinary Period
However, if we examine the distributions up to the end of 2019, before the onset of the pandemic, the returns for both spreads appear to be much more normally distributed and while both are still leptokurtic, kurtosis is lower for both. CBT:TY-FV has excess kurtosis of 0.8416 and ICE:G-B has excess kurtosis of 3.8181. The standard deviation of CBT:TY-FV is slightly decreased, from 0.0188 to 0.0166, whereas the standard deviation of ICE:G-B has decreased significantly, from 0.1469 to 0.0597.

In [135]:
# Same charts restricted to rows up to and including 2019-12-31. No
# return_type is passed here, so the helper's default return type applies.
utils.make_tail_charts(
    pairs, df_second.loc[:"2019-12-31"],
    title_text="Futures Spreads - Second Month - Distribution of Daily Returns",
    moments_xanchors=("right", "left")
)

Another potentially useful analysis is to examine the distributions before and after the potentially extraordinary period. If both periods have similar characteristics, that may increase confidence in our expectations of the future. If they have different characteristics, that may decrease our confidence in our expectations of the future since there are various potential rationales for the difference, including (i) the more recent period is the new normal, (ii) the period prior to the extraordinary period is normal and the market has yet to return to normal or (iii) neither period is normal and the market is still transitioning to the new normal. Reviewing a longer history as well as more recent data may help increase confidence in our expectations.

When we look at the distribution of CBT:TY-FV for the period from 2020-06-01 to 2020-08-31 alongside the distribution for the period from 2018-12-03 to 2019-12-31, we can see that distributions from the two different periods do in fact have similar characteristics with respect to standard deviation, skewness and kurtosis.

In [136]:
# Compare the first spread with itself across two windows: everything up to
# 2019-12-31 versus everything from 2020-06-01 onward.
before_2020 = slice(None, "2019-12-31")
from_june_2020 = slice("2020-06-01", None)
utils.make_tail_charts(
    (pairs[0], pairs[0]),
    df_second,
    title_text="Futures Spreads - Second Month - Distribution of Daily Returns",
    date_slices=(before_2020, from_june_2020),
)

However, when we look at the distributions for ICE:G-B for the two different periods, we see that there is quite a bit more variance in the more recent period, but similar kurtosis.

In [137]:
# Compare the second spread with itself across two windows: everything up to
# 2019-12-31 versus everything from 2020-06-01 onward.
before_2020 = slice(None, "2019-12-31")
from_june_2020 = slice("2020-06-01", None)
utils.make_tail_charts(
    (pairs[1], pairs[1]),
    df_second,
    title_text="Futures Spreads - Second Month - Distribution of Daily Returns",
    date_slices=(before_2020, from_june_2020),
    moments_xanchors=("left", "right"),
)

### Q-Q Plots
To gain more insight into the occurrence of outliers in the distributions of returns, we can examine Q-Q plots. These plots are prepared by sorting normalized daily returns in ascending order and plotting them against an equivalent number of divisions of the standard normal distribution from p = 0.001 to p = 0.999.

#### Entire Period
Here we can see that both distributions have some significant outliers, ICE:G-B to a greater extent, with two observations greater than seven standard deviations from the mean. It is also interesting to note that within approximately 1.5 standard deviations of the mean, both distributions appear to be relatively normal.

In [138]:
# Q-Q charts for both spreads over the full sample.
utils.make_qq_charts(
    pairs, df_second,
    "Q-Q Plots"
)

#### Excluding Extraordinary Period
Here are the same plots including just the period up to the end of 2019. While there are still some outliers, there are far fewer of them and the distributions appear to be normal. The reduction in the quantity and magnitude of the outliers relative to the plot of the entire period appears to be consistent with the reduction in calculated kurtosis.

In [139]:
# Q-Q charts for both spreads, restricted to rows up to and including 2019-12-31.
utils.make_qq_charts(
    pairs, df_second.loc[:"2019-12-31"],
    "Q-Q Plots"
)

There are relatively few observations in the more recent period, but for CBT:TY-FV, even though there appear to be a few outliers, they do not appear to be significant outliers, all within three standard deviations of the mean.

In [140]:
# Q-Q charts of the first spread for two windows: up to 2019-12-31 and from
# 2020-06-01 onward.
before_2020 = slice(None, "2019-12-31")
from_june_2020 = slice("2020-06-01", None)
utils.make_qq_charts(
    (pairs[0], pairs[0]),
    df_second,
    "Q-Q Plots",
    date_slices=(before_2020, from_june_2020),
)

We can also see that the relatively high excess kurtosis of 6.5051 for ICE:G-B is likely due to the most extreme outlier at 4.0 standard deviations from the mean.

In [141]:
# Q-Q charts of the second spread for two windows: up to 2019-12-31 and from
# 2020-06-01 onward.
before_2020 = slice(None, "2019-12-31")
from_june_2020 = slice("2020-06-01", None)
utils.make_qq_charts(
    (pairs[1], pairs[1]),
    df_second,
    "Q-Q Plots",
    date_slices=(before_2020, from_june_2020),
)

## Rolling Kurtosis
For a final evaluation of kurtosis, we examine kurtosis on a rolling basis over various window lengths. Over the shorter windows, kurtosis for both distributions is lower than it is over the longer windows, with an average of 0.43467 for CBT:TY-FV and 0.58609 for ICE:G-B for the 30 day window. This would seem to be because there are fewer opportunities for outliers to occur over shorter periods of time and hence may not be a useful characteristic on its own. However, the fact that the higher kurtosis over longer periods appears to be the result of a smaller number of outliers of a large magnitude may indicate that it is possible to develop a good strategy if the outliers can be neutralized with stop loss limits or otherwise.

In [142]:
# Rolling kurtosis per spread; the recorded output shows one column per
# (spread, window) with windows of 30, 90, 180 and 360.
df_kurts = utils.get_rolling_kurts(pairs, df_second)
df_kurts.tail()

spread,CBT:TY-FV,CBT:TY-FV,CBT:TY-FV,CBT:TY-FV,ICE:G-B,ICE:G-B,ICE:G-B,ICE:G-B
window,30,90,180,360,30,90,180,360
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2020-08-25,1.397219,0.576965,6.732732,5.636996,3.897592,2.884149,5.246334,11.547127
2020-08-26,1.477641,0.663195,6.930461,5.637989,3.759802,3.19413,5.211892,11.493766
2020-08-27,1.001146,0.532799,6.821145,5.596871,3.791985,3.551272,5.207769,11.476436
2020-08-28,1.047394,0.52918,6.90507,5.605851,3.651172,3.021644,5.203611,11.472453
2020-08-31,0.873001,0.610804,6.89687,5.601522,3.681314,3.224584,5.204765,11.471913


In [143]:
# Chart the rolling kurtosis series computed above.
utils.make_rolling_charts(df_kurts, "Rolling Kurtosis")

## Rolling Average Differences
The last analysis is of statistics of the differences between the spread daily returns and rolling averages over various window lengths.

In [144]:
# Differences between daily returns and their rolling averages, per spread
# and window length; summarised with describe().
df_diffs = utils.get_rolling_avg_diffs(pairs, df_second)
df_diffs.describe()

spread,CBT:TY-FV,CBT:TY-FV,CBT:TY-FV,CBT:TY-FV,ICE:G-B,ICE:G-B,ICE:G-B,ICE:G-B
window,30,90,180,360,30,90,180,360
count,422.0,362.0,272.0,92.0,420.0,360.0,270.0,90.0
mean,-0.000154,-0.000237,-0.0009,-0.001972,-7e-05,-0.000147,-0.001526,0.018242
std,0.018563,0.019262,0.020501,0.011987,0.140666,0.152194,0.173605,0.231852
min,-0.10161,-0.102458,-0.102479,-0.030846,-0.877348,-0.914623,-0.927382,-0.844486
25%,-0.010608,-0.010254,-0.010937,-0.007972,-0.041194,-0.047763,-0.059288,-0.098296
50%,0.000302,0.000333,-0.000757,-0.001579,-0.003982,-0.004259,-0.005772,0.000216
75%,0.01007,0.01012,0.009246,0.005383,0.043124,0.049741,0.056628,0.092203
max,0.079267,0.081435,0.082706,0.031893,0.762382,0.734602,0.725851,0.7219


In [145]:
# Chart the rolling-average differences on a square 1000x1000 figure.
diff_chart_size = {"height": 1000, "width": 1000}
utils.make_rolling_charts(
    df_diffs,
    "Differences to Rolling Averages - Daily Returns",
    fig_size=diff_chart_size,
)