# Rolling Regressions

## Imports

In [1]:
# <include-rolling_regressions/utils.py>

In [27]:
# <imports>
import numpy as np
import pandas as pd
import plotly.io as pio
from patsy import dmatrices
import statsmodels.api as sm

from rolling_regressions import utils

pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"

In [3]:
if False:
    df_prices = utils.fetch_all_tickers(["SPY", "SUN", "IBM", "MSFT", "AAPL"], query_params={"start_date": "2016-01-01", "end_date": "2020-12-31"})
    df_prices.to_csv("df_prices.csv")
df_prices = pd.read_csv("df_prices.csv").set_index(["ticker", "date"])

## Summary

## Analysis

Here we specify times in terms of numbers of days (which we take to be numbers of observations ignoring differences in duration between observations) and the calculate chacteristic time, $\lambda$ as $\frac{1}{\text{days}}$. We start out with inifinite series not correcting for the long small tails of the weights, even for the shorter time periods. The chart below is a bit of a simplification in that it normalizes the weights based on their sum through 360 days, excluding the weights still applicable to observations beyond 360 days, but since they are small, doing so does not alter the point of the chart. The main point is that observations well past the time $t$ continue to impact the moving average. It appears that about 65% of the statistic is comprised as a function of observations up to to $t$ with the remaining 35% coming from after that. Even at a boxcar window of $\frac{2}{\lambda}$ for 90 days results in a window of 180 days, but from the chart below that only accounts for approximately 88% of the statistic with another 12% coming from observations after 180 days.

### Potential Analyses
* Different box car window periods relative to the same characteristic times

In [4]:
times = np.array([5, 10, 30, 60, 90])
lambdas = 1 / times

In [28]:
nobs = 360
fig = utils.go.Figure()
for t in times:
    weights = np.power(1 - 1/t, np.arange(nobs))
    weights = weights / weights.sum()
    fig.add_scatter(x=np.arange(nobs), y=pd.Series(weights).cumsum(), name=f"t = {t}")
fig.update_layout(title_text="Weights by Time Period")
fig.show()

In [6]:
df_ret = np.log(df_prices.adj_close.unstack("ticker") / df_prices.adj_close.unstack("ticker").shift())
df_ret.head()

ticker,AAPL,IBM,MSFT,SPY,SUN
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-04,,,,,
2016-01-05,-0.025379,-0.000736,0.004552,0.00169,-0.001264
2016-01-06,-0.019764,-0.005018,-0.018332,-0.012694,-0.03108
2016-01-07,-0.043121,-0.017237,-0.035402,-0.024284,-0.027508
2016-01-08,0.005274,-0.009301,0.003062,-0.011037,-0.016221


In [86]:
df_ret.shift(0).head(10)

ticker,AAPL,IBM,MSFT,SPY,SUN
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-04,,,,,
2016-01-05,-0.025379,-0.000736,0.004552,0.00169,-0.001264
2016-01-06,-0.019764,-0.005018,-0.018332,-0.012694,-0.03108
2016-01-07,-0.043121,-0.017237,-0.035402,-0.024284,-0.027508
2016-01-08,0.005274,-0.009301,0.003062,-0.011037,-0.016221
2016-01-11,0.016063,0.012082,-0.000573,0.00099,-0.046585
2016-01-12,0.014409,-0.00248,0.009136,0.008036,-0.012644
2016-01-13,-0.026047,-0.013103,-0.021836,-0.025257,-0.041027
2016-01-14,0.021635,0.013178,0.028069,0.016284,0.038131
2016-01-15,-0.024308,-0.021907,-0.040736,-0.0217,0.012107


In [141]:
df_ret.rolling(window=5).cov().groupby("ticker").shift(-5)

Unnamed: 0_level_0,ticker,AAPL,IBM,MSFT,SPY,SUN
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,AAPL,0.000571,0.000184,0.000289,0.000151,-0.000161
2016-01-04,IBM,0.000184,0.000118,0.000119,0.000103,-0.000069
2016-01-04,MSFT,0.000289,0.000119,0.000296,0.000161,0.000092
2016-01-04,SPY,0.000151,0.000103,0.000161,0.000116,0.000020
2016-01-04,SUN,-0.000161,-0.000069,0.000092,0.000020,0.000287
...,...,...,...,...,...,...
2020-12-31,AAPL,,,,,
2020-12-31,IBM,,,,,
2020-12-31,MSFT,,,,,
2020-12-31,SPY,,,,,


In [142]:
col_list = []
for t in times:
    for win_type, df_cov in {"ewm": df_ret.ewm(alpha=1/t).cov(), "boxcar": df_ret.rolling(window=2*t).cov(), "boxcar_fwd": df_ret.rolling(window=5).cov().groupby("ticker").shift(-5)}.items():
        if not (win_type == "boxcar_fwd" and t > times[0]):
            s_var = pd.Series((np.concatenate(np.repeat(np.expand_dims(np.eye(len(df_ret.columns)), axis=0), len(df_ret.index), axis=0), axis=0) * df_cov).sum(axis=1), name=("var_x", win_type, f"t_{t:02d}"))
            s_cov = df_cov["SPY"]
            s_cov.name = ("cov_xy", win_type, f"t_{t:02d}")
            col_list.extend([s_var, s_cov])

df_vars = pd.concat(col_list, axis=1)
df_vars = df_vars.loc[df_vars.index.get_level_values("ticker") != "SPY"]
df_vars.columns.names = ["stat", "win_type", "time"]

df_beta = df_vars["cov_xy"].divide(df_vars["var_x"])
df_beta.columns = pd.MultiIndex.from_tuples([("beta_1", *c) for c in df_beta.columns], names=["stat", "win_type", "time"])
df_betas = pd.concat([df_vars, df_beta], axis=1)
df_betas = df_betas.swaplevel("win_type", "stat", axis=1)
df_betas = df_betas.sort_index(axis=1)
df_betas.index = df_betas.index.swaplevel()
df_betas = df_betas.sort_index()
df_betas

Unnamed: 0_level_0,win_type,boxcar,boxcar,boxcar,boxcar,boxcar,boxcar,boxcar,boxcar,boxcar,boxcar,...,ewm,ewm,ewm,ewm,ewm,ewm,ewm,ewm,ewm,ewm
Unnamed: 0_level_1,stat,beta_1,beta_1,beta_1,beta_1,beta_1,cov_xy,cov_xy,cov_xy,cov_xy,cov_xy,...,cov_xy,cov_xy,cov_xy,cov_xy,cov_xy,var_x,var_x,var_x,var_x,var_x
Unnamed: 0_level_2,time,t_05,t_10,t_30,t_60,t_90,t_05,t_10,t_30,t_60,t_90,...,t_05,t_10,t_30,t_60,t_90,t_05,t_10,t_30,t_60,t_90
ticker,date,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
AAPL,2016-01-04,,,,,,,,,,,...,,,,,,0.000000,0.000000,0.000000,0.000000,0.000000
AAPL,2016-01-05,,,,,,,,,,,...,,,,,,0.000000,0.000000,0.000000,0.000000,0.000000
AAPL,2016-01-06,,,,,,,,,,,...,-4.038390e-05,-0.000040,-0.000040,-0.000040,-0.000040,0.000016,0.000016,0.000016,0.000016,0.000016
AAPL,2016-01-07,,,,,,,,,,,...,1.204272e-04,0.000114,0.000110,0.000109,0.000109,0.000168,0.000158,0.000152,0.000150,0.000150
AAPL,2016-01-08,,,,,,,,,,,...,1.000488e-04,0.000088,0.000082,0.000080,0.000080,0.000496,0.000444,0.000414,0.000407,0.000405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SUN,2020-12-24,0.155685,0.124589,0.286701,0.254559,0.275134,0.000032,0.000035,0.000105,0.000090,0.000150,...,2.131667e-05,0.000043,0.000092,0.000182,0.000265,0.000177,0.000277,0.000383,0.000619,0.000854
SUN,2020-12-28,0.203235,0.113951,0.279726,0.250349,0.273393,0.000034,0.000032,0.000103,0.000089,0.000150,...,5.564877e-06,0.000031,0.000086,0.000177,0.000261,0.000154,0.000260,0.000376,0.000612,0.000846
SUN,2020-12-29,0.099479,0.094289,0.279364,0.248957,0.266273,0.000012,0.000020,0.000103,0.000088,0.000141,...,-2.690544e-06,0.000026,0.000083,0.000174,0.000258,0.000139,0.000238,0.000364,0.000602,0.000837
SUN,2020-12-30,0.034810,0.062681,0.268901,0.247036,0.266032,0.000005,0.000013,0.000098,0.000087,0.000141,...,-2.420149e-06,0.000023,0.000080,0.000171,0.000255,0.000144,0.000228,0.000354,0.000593,0.000829


In [182]:
df_betas[("boxcar_fwd", "beta_1", f"t_{time:02d}")].loc[ticker]

ticker  date      
AAPL    2016-01-04    0.265313
        2016-01-05    0.446770
        2016-01-06    0.510665
        2016-01-07    0.798343
        2016-01-08    0.761398
                        ...   
        2020-12-24         NaN
        2020-12-28         NaN
        2020-12-29         NaN
        2020-12-30         NaN
        2020-12-31         NaN
Name: (boxcar_fwd, beta_1, t_05), Length: 1259, dtype: float64

In [181]:
ticker = ["AAPL"]
time = times[0]
fig = utils.go.Figure()
fig.add_histogram(x=df_betas[("boxcar", "beta_1", f"t_{time:02d}")].loc[ticker].values, histnorm="percent", name="boxcar", opacity=0.7)
fig.add_histogram(x=df_betas[("boxcar_fwd", "beta_1", "t_05")].loc[ticker].values, histnorm="percent", name="boxcar_fwd", opacity=0.7)
fig.add_histogram(x=df_betas[("ewm", "beta_1", f"t_{time:02d}")].loc[ticker].values, histnorm="percent", name="ewm", opacity=0.7)
fig.update_layout(
    title_text=("Coefficients Histograms"),
    font=dict(size=10),
    margin=dict(l=50, r=10, b=40, t=90),
    barmode="overlay",
    yaxis_title="% of all coefficients",
    xaxis_title="coefficient",
)

In [173]:
df_betas[[("boxcar", "beta_1", f"t_{time:02d}"), ("boxcar_fwd", "beta_1", "t_05"), ("ewm", "beta_1", f"t_{time:02d}")]].stack("win_type")

Unnamed: 0_level_0,Unnamed: 1_level_0,stat,beta_1
Unnamed: 0_level_1,Unnamed: 1_level_1,time,t_05
ticker,date,win_type,Unnamed: 3_level_2
AAPL,2016-01-04,boxcar_fwd,0.265313
AAPL,2016-01-05,boxcar_fwd,0.446770
AAPL,2016-01-06,boxcar_fwd,0.510665
AAPL,2016-01-06,ewm,-2.561786
AAPL,2016-01-07,boxcar_fwd,0.798343
...,...,...,...
SUN,2020-12-29,ewm,-0.019415
SUN,2020-12-30,boxcar,0.034810
SUN,2020-12-30,ewm,-0.016763
SUN,2020-12-31,boxcar,0.087161


In [172]:
df_betas[[("boxcar", "beta_1", f"t_{time:02d}"), ("boxcar_fwd", "beta_1", "t_05"), ("ewm", "beta_1", f"t_{time:02d}")]].loc[ticker].dropna().describe()

win_type,boxcar,boxcar_fwd,ewm
stat,beta_1,beta_1,beta_1
time,t_05,t_05,t_05
count,3732.0,3732.0,3732.0
mean,0.334711,0.340386,0.329256
std,0.279123,0.383414,0.26588
min,-0.295087,-2.457226,-0.409138
25%,0.117025,0.088176,0.121776
50%,0.295238,0.312252,0.292348
75%,0.53265,0.582768,0.509478
max,1.506663,2.700635,1.546417


In [90]:
df_vars

Unnamed: 0_level_0,stat,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy,...,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy,var_x,cov_xy
Unnamed: 0_level_1,win_type,ewm,ewm,boxcar,boxcar,boxcar_fwd,boxcar_fwd,ewm,ewm,boxcar,boxcar,...,boxcar,boxcar,ewm,ewm,boxcar,boxcar,ewm,ewm,boxcar,boxcar
Unnamed: 0_level_2,time,t_05,t_05,t_05,t_05,t_05,t_05,t_10,t_10,t_10,t_10,...,t_30,t_30,t_60,t_60,t_60,t_60,t_90,t_90,t_90,t_90
date,ticker,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
2016-01-04,AAPL,0.000000,,0.000000,,0.0,,0.000000,,0.000000,,...,0.000000,,0.000000,,0.000000,,0.000000,,0.000000,
2016-01-04,IBM,0.000000,,0.000000,,0.0,,0.000000,,0.000000,,...,0.000000,,0.000000,,0.000000,,0.000000,,0.000000,
2016-01-04,MSFT,0.000000,,0.000000,,0.0,,0.000000,,0.000000,,...,0.000000,,0.000000,,0.000000,,0.000000,,0.000000,
2016-01-04,SUN,0.000000,,0.000000,,0.0,,0.000000,,0.000000,,...,0.000000,,0.000000,,0.000000,,0.000000,,0.000000,
2016-01-05,AAPL,0.000000,,0.000000,,0.0,,0.000000,,0.000000,,...,0.000000,,0.000000,,0.000000,,0.000000,,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-30,SUN,0.000144,-2.420149e-06,0.000145,0.000005,0.0,,0.000228,0.000023,0.000211,0.000013,...,0.000363,0.000098,0.000593,0.000171,0.000353,0.000087,0.000829,0.000255,0.000528,0.000141
2020-12-31,AAPL,0.000323,3.956640e-05,0.000307,0.000028,0.0,,0.000328,0.000053,0.000309,0.000055,...,0.000447,0.000133,0.000554,0.000203,0.000654,0.000185,0.000611,0.000264,0.000532,0.000189
2020-12-31,IBM,0.000083,2.474553e-05,0.000076,0.000019,0.0,,0.000122,0.000036,0.000145,0.000036,...,0.000377,0.000087,0.000333,0.000147,0.000261,0.000088,0.000396,0.000208,0.000343,0.000162
2020-12-31,MSFT,0.000095,1.142266e-05,0.000093,0.000004,0.0,,0.000116,0.000020,0.000109,0.000012,...,0.000261,0.000116,0.000358,0.000182,0.000372,0.000155,0.000437,0.000247,0.000363,0.000178


As a check to makes sure the that the calculations appear to be being performed correctly we can plot one ticker, SPY and the variance and covariance. This looks like its doing the right thing - the 90 day moving average with much more muted responses to changes in AAPL variance.

In [30]:
fig = utils.make_subplots(specs=[[{"secondary_y": True}]])
fig.add_scatter(x=df_betas.index.levels[1], y=df_prices.loc["AAPL"].adj_close, name="AAPL")
fig.add_scatter(x=df_betas.index.levels[1], y=df_betas.loc["AAPL"].var_x.t_05, name="t_05", secondary_y=True)
fig.add_scatter(x=df_betas.index.levels[1], y=df_betas.loc["AAPL"].var_x.t_90, name="t_90", secondary_y=True)
fig.update_layout(title="EWM Variance", showlegend=True)

Covariance appears to be working as well.

In [38]:
fig = utils.make_subplots(specs=[[{"secondary_y": True}]])
fig.add_scatter(x=df_betas.index.levels[1], y=df_prices.loc["AAPL"].adj_close / df_prices.loc["AAPL"].adj_close.iloc[0] * 100, name="AAPL") 
fig.add_scatter(x=df_betas.index.levels[1], y=df_prices.loc["SPY"].adj_close / df_prices.loc["SPY"].adj_close.iloc[0] * 100, name="SPY", line=dict(color=utils.COLORS[3]))
fig.add_scatter(x=df_betas.index.levels[1], y=df_betas.loc["AAPL"].cov_xy.t_05, name="cov_xy.t_05", secondary_y=True, line=dict(color=utils.COLORS[1]))
fig.add_scatter(x=df_betas.index.levels[1], y=df_betas.loc["AAPL"].cov_xy.t_90, name="cov_xy.t_90", secondary_y=True, line=dict(color=utils.COLORS[2]))
fig.update_layout(title="EWM Covariance", showlegend=True)

## Box Car Coefficients

This can be simplified with the calculation of ewma above by just calcing the var and covs and the rest is the same.

In [46]:
col_list = []
for t in times:
    s_var = pd.Series((np.concatenate(np.repeat(np.expand_dims(np.eye(len(df_ret.columns)), axis=0), len(df_ret.index), axis=0), axis=0) * df_ret.rolling(window=2 * t).cov()).sum(axis=1), name=("var_x", f"t_{t:02d}"))
    s_cov = df_ret.rolling(window=2 * t).cov()["SPY"]
    s_cov.name = ("cov_xy", f"t_{t:02d}")
    col_list.extend([s_var, s_cov])
df_vars = pd.concat(col_list, axis=1)
df_vars = df_vars.loc[df_vars.index.get_level_values("ticker") != "SPY"]
df_vars.columns.names = ["stat", "time"]

df_beta = df_vars["cov_xy"].divide(df_vars["var_x"])
df_beta.columns = pd.MultiIndex.from_tuples([("beta_1", c) for c in df_beta.columns], names=["stat", "time"])

df_betas = pd.concat([df_vars, df_beta], axis=1)
df_betas = df_betas.sort_index(axis=1)
df_betas.index = df_betas.index.swaplevel()
df_betas = df_betas.sort_index()
df_betas.loc["AAPL"].head(60)

stat,beta_1,beta_1,beta_1,beta_1,beta_1,cov_xy,cov_xy,cov_xy,cov_xy,cov_xy,var_x,var_x,var_x,var_x,var_x
time,t_05,t_10,t_30,t_60,t_90,t_05,t_10,t_30,t_60,t_90,t_05,t_10,t_30,t_60,t_90
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
2016-01-04,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-05,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-06,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-07,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-08,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-11,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-12,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-13,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-14,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-15,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0


### Forward Looking 5 Box Car Coefficients

In [11]:
res = sm.OLS(y, X).fit()
res.summary()

NameError: name 'y' is not defined

In [None]:
df_cov = df_ret.loc["2020-04-27":"2020-05-11"].cov()
df_cov

ticker,SPY,SUN
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
SPY,0.000215,0.000218
SUN,0.000218,0.000756


In [None]:
beta = df_cov.SPY.iloc[1] / df_cov.SPY.iloc[0]
beta

1.01456983993913

In [None]:
len(df_ret.loc["2020-04-27":"2020-05-11"])

11

In [None]:
df_ret

ticker,SPY,SUN
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-04,,
2016-01-05,0.001691,-0.001263
2016-01-06,-0.012614,-0.030602
2016-01-07,-0.023992,-0.027133
2016-01-08,-0.010977,-0.016090
...,...,...
2020-12-24,0.003890,-0.004875
2020-12-28,0.008591,-0.011197
2020-12-29,-0.001908,0.004600
2020-12-30,0.001427,0.010919


In [None]:
cov_SPY_SUN = (df_ret.loc["2020-04-27":"2020-05-11"] - df_ret.loc["2020-04-27":"2020-05-11"].mean()).prod(axis=1).sum() / 10
cov_SPY_SUN

0.0002184964962972288

In [None]:
var_SPY_SUN = (df_ret.loc["2020-04-27":"2020-05-11"] - df_ret.loc["2020-04-27":"2020-05-11"].mean()).pow(2).sum() / 10
var_SPY_SUN

ticker
SPY    0.000215
SUN    0.000756
dtype: float64

In [None]:
df_ret.loc["2020-04-27":"2020-05-11"].mean().SUN - beta * df_ret.loc["2020-04-27":"2020-05-11"].mean().SPY

0.008069693987880534

## Check to see if