# Robust Regression

## Imports

In [1]:
# <include-robust_regression/utils.py>

In [69]:
# <imports>
import numpy as np
import pandas as pd
import plotly.io as pio

from robust_regression import utils

# Use plotly as the pandas plotting backend and "seaborn" as the default plotly template.
pd.set_option("plotting.backend", "plotly")
pio.templates.default = "seaborn"

## Summary

In [70]:
# Observation dates at a fixed 7-day step; later cells align every series to this index.
date_range = pd.date_range(start="2018-01-03", end="2021-04-30", freq="7D")

In [71]:
# Read the tab-delimited CDS file and pivot it to one column per ticker,
# keeping only the 5-year spread on the observation dates in `date_range`.
cds_long = pd.read_csv("Liq5YCDS.delim", sep="\t", index_col=0, parse_dates=["date"])
cds_wide = cds_long.set_index(["ticker", "date"]).unstack("ticker")
spread_levels = cds_wide.loc[date_range, ["spread5y"]]

# Log change versus the previous observation date; the first row has no
# predecessor and is therefore NaN.
df_returns = np.log(spread_levels.div(spread_levels.shift(1)))
df_returns.columns = df_returns.columns.set_levels(["r_spread"], level=0)

# Spread levels and their log changes side by side under separate top-level labels.
df_spreads = pd.concat([spread_levels, df_returns], axis=1)
df_spreads


Unnamed: 0_level_0,spread5y,spread5y,spread5y,spread5y,spread5y,spread5y,spread5y,spread5y,spread5y,spread5y,...,r_spread,r_spread,r_spread,r_spread,r_spread,r_spread,r_spread,r_spread,r_spread,r_spread
ticker,BA,C,DD,F,GE,JPM,LOW,LUV,MAR,T,...,DD,F,GE,JPM,LOW,LUV,MAR,T,WFC,XOM
2018-01-03,0.001682,0.004080,0.002425,0.008474,0.004076,0.003763,0.001510,0.002989,0.002351,0.005180,...,,,,,,,,,,
2018-01-10,0.001585,0.004070,0.002455,0.008620,0.004115,0.003871,0.001400,0.003126,0.002438,0.005178,...,0.012211,0.017057,0.009481,0.028320,-0.075871,0.044720,0.036233,-0.000253,0.002157,-0.011262
2018-01-17,0.001728,0.004051,0.002570,0.009427,0.004752,0.003792,0.002029,0.003344,0.002773,0.005182,...,0.045783,0.089462,0.144074,-0.020688,0.371295,0.067317,0.128780,0.000701,0.003067,-0.019541
2018-01-24,0.001736,0.004076,0.002722,0.010133,0.005974,0.003790,0.002448,0.003590,0.002803,0.005176,...,0.057544,0.072222,0.228712,-0.000441,0.187634,0.071116,0.010832,-0.001168,-0.011862,-0.031642
2018-01-31,0.001708,0.003999,0.002825,0.010397,0.005928,0.003711,0.002598,0.003700,0.002602,0.005179,...,0.037060,0.025797,-0.007644,-0.021182,0.059667,0.030056,-0.074299,0.000574,-0.008492,0.003986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-31,0.012051,0.005761,0.003033,0.025899,0.008664,0.004895,0.003614,0.009034,0.008611,0.007013,...,0.046823,-0.027879,-0.073429,0.027238,-0.092663,-0.042060,0.008669,0.016319,0.024295,0.008793
2021-04-07,0.011651,0.005457,0.003028,0.023825,0.008434,0.004670,0.003363,0.008319,0.008386,0.006982,...,-0.001756,-0.083485,-0.026906,-0.047155,-0.071876,-0.082421,-0.026484,-0.004493,-0.048639,0.001937
2021-04-14,0.012230,0.005095,0.003055,0.023494,0.008256,0.004289,0.003364,0.008309,0.008525,0.006957,...,0.008849,-0.013991,-0.021344,-0.085086,0.000119,-0.001222,0.016437,-0.003616,-0.090404,-0.011578
2021-04-21,0.012743,0.005245,0.003202,0.023069,0.008086,0.004546,0.003460,0.008485,0.008832,0.007032,...,0.047049,-0.018256,-0.020827,0.058187,0.028257,0.020896,0.035377,0.010797,-0.006018,-0.012946


In [72]:
# Flip to True to re-download prices and overwrite the local CSV cache.
REFRESH_PRICES = False
if REFRESH_PRICES:
    # NOTE(review): the original branch read tickers from an undefined `df_g`
    # (NameError if enabled). `df_spreads` is the frame carrying a "ticker"
    # column level at this point — confirm it is the intended source.
    # Each ticker appears under both spread5y and r_spread, so de-duplicate.
    tickers = df_spreads.columns.get_level_values("ticker").unique().to_list()
    data = utils.fetch_all_tickers(tickers, {"start_date": "2018-01-01"})
    data.to_csv("df_prices.csv")

# Adjusted closes pivoted to one column per ticker and aligned to `date_range`;
# dates absent from the price file take the last earlier available value.
df_prices = (
    pd.read_csv("df_prices.csv", parse_dates=["date"])
    .set_index(["ticker", "date"])[["adj_close"]]
    .unstack("ticker")
    .reindex(date_range).ffill()
)
assert len(df_prices) == len(df_spreads), "price and spread frames must have the same number of rows"

df_prices.columns = df_prices.columns.set_names(["series", "ticker"])
# Log change of the adjusted close versus the previous observation date.
df_returns = np.log(df_prices / df_prices.shift())
df_returns.columns = df_returns.columns.set_levels(["r_equity"], level=0)
df_prices = pd.concat([df_prices, df_returns], axis=1)

# Join prices with spreads, drop the first date (its returns are NaN because
# there is no prior observation), then go long: one row per (date, ticker).
df_data = pd.concat([df_prices, df_spreads], axis=1).iloc[1:]
df_data = df_data.stack("ticker")
df_data.index = df_data.index.set_names(["date", "ticker"])

# Leave-one-out index: for each row, the mean r_spread of the *other* tickers
# on the same date, i.e. (sum over the date - own value) / (count on the date - 1).
df_data["r_index"] = (
    (df_data.groupby("date")["r_spread"].transform("sum") - df_data["r_spread"]) /
    (df_data.groupby("date")["r_spread"].transform("count") - 1)
)
df_data.head()

Unnamed: 0_level_0,series,adj_close,r_equity,r_spread,spread5y,r_index
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-10,BA,305.145884,0.072711,-0.059278,0.001585,0.005486
2018-01-10,C,68.48299,0.014111,-0.002452,0.00407,0.00032
2018-01-10,DD,99.438127,0.01562,0.012211,0.002455,-0.001013
2018-01-10,F,11.202605,0.020939,0.017057,0.00862,-0.001454
2018-01-10,GE,17.577243,0.042025,0.009481,0.004115,-0.000765


## OLS with Intercept

In [73]:
# Called with default arguments; the printed summary below reports an OLS fit
# that includes an Intercept term.
df_errors, last_summary = utils.get_errors(df_data)

# One histogram panel per value of `distance`.
fig = utils.px.histogram(
    df_errors,
    x="error",
    facet_col="distance",
    title="Distribution of Errors: OLS with Intercept",
).update_layout(width=1400)
fig.show()
print(last_summary)

                            OLS Regression Results                            
Dep. Variable:               r_spread   R-squared:                       0.224
Model:                            OLS   Adj. R-squared:                  0.216
Method:                 Least Squares   F-statistic:                     27.24
Date:                Wed, 12 May 2021   Prob (F-statistic):           4.03e-11
Time:                        19:41:20   Log-Likelihood:                 314.14
No. Observations:                 192   AIC:                            -622.3
Df Residuals:                     189   BIC:                            -612.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0035      0.004      0.985      0.3

In [63]:
# Summary statistics, transposed so each column is one row; show only the first (the `error` row).
df_errors.describe().T.head(1)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
error,7392.0,-0.000477,0.080892,-0.811953,-0.029024,0.000646,0.030054,0.553429


In [64]:
# Same summary statistics, broken out by `distance`.
df_errors.groupby(by="distance").describe()

Unnamed: 0_level_0,error,error,error,error,error,error,error,error
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
distance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1848.0,0.000512,0.080295,-0.72129,-0.028485,0.001046,0.030806,0.553429
2,1848.0,-0.000911,0.08156,-0.751374,-0.028642,0.000839,0.029832,0.522851
3,1848.0,-0.000266,0.081367,-0.802473,-0.029171,0.00011,0.029942,0.510374
4,1848.0,-0.001242,0.080393,-0.811953,-0.029863,0.000737,0.029869,0.50731


## OLS No Intercept


In [84]:
# Refit with B0=0; the printed summary reports uncentered R-squared, i.e. a fit
# without an intercept term.
df_errors, last_summary = utils.get_errors(df_data, B0=0)
# Title corrected: it previously read "OLS with Intercept" (copied from the
# preceding section) even though this is the no-intercept fit.
fig = utils.px.histogram(df_errors, "error", title="Distribution of Errors: OLS No Intercept")
fig.update_layout(width=1400)
fig.show()
print(last_summary)

                                 OLS Regression Results                                
Dep. Variable:               r_spread   R-squared (uncentered):                   0.221
Model:                            OLS   Adj. R-squared (uncentered):              0.213
Method:                 Least Squares   F-statistic:                              26.95
Date:                Wed, 12 May 2021   Prob (F-statistic):                    4.99e-11
Time:                        20:01:04   Log-Likelihood:                          313.65
No. Observations:                 192   AIC:                                     -623.3
Df Residuals:                     190   BIC:                                     -616.8
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [66]:
# Summary statistics for the no-intercept fit; only the first row (`error`) is shown.
df_errors.describe().T.head(1)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
error,7392.0,-7e-05,0.080689,-0.802419,-0.028725,0.000607,0.030035,0.551899


In [67]:
# No-intercept error statistics, broken out by `distance`.
df_errors.groupby(by="distance").describe()

Unnamed: 0_level_0,error,error,error,error,error,error,error,error
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
distance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1848.0,0.00084,0.080031,-0.721231,-0.027991,0.0009,0.030708,0.551899
2,1848.0,-0.000479,0.081417,-0.746042,-0.028753,0.000707,0.02989,0.523187
3,1848.0,0.000149,0.081143,-0.7972,-0.028733,0.000274,0.029768,0.516659
4,1848.0,-0.00079,0.080212,-0.802419,-0.029559,0.000404,0.029549,0.518128


## Tukey with Intercept


In [90]:
# Robust fit: the printed summary reports an RLM with the TukeyBiweight norm
# and an Intercept term.
df_errors, last_summary = utils.get_errors(df_data, model="RLM", penalty="Tukey", B0=1)

fig = utils.px.histogram(
    df_errors,
    x="error",
    title="Distribution of Errors: RLM: Tukey with Intercept",
).update_layout(width=1400)
fig.show()
print(last_summary)

                    Robust linear Model Regression Results                    
Dep. Variable:               r_spread   No. Observations:                  192
Model:                            RLM   Df Residuals:                      189
Method:                          IRLS   Df Model:                            2
Norm:                   TukeyBiweight                                         
Scale Est.:                       mad                                         
Cov Type:                          H1                                         
Date:                Wed, 12 May 2021                                         
Time:                        20:06:12                                         
No. Iterations:                    22                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0057      0.003      1.635      0.1

In [91]:
# Summary statistics for the Tukey fit; only the first row (`error`) is shown.
df_errors.describe().T.head(1)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
error,7392.0,-0.002096,0.079825,-0.828406,-0.031167,-0.000664,0.027426,0.695419


In [92]:
# Tukey-fit error statistics, broken out by `distance`.
df_errors.groupby(by="distance").describe()

Unnamed: 0_level_0,error,error,error,error,error,error,error,error
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
distance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1848.0,-0.001919,0.079401,-0.735278,-0.030001,-0.000102,0.027514,0.51149
2,1848.0,-0.001333,0.080431,-0.734252,-0.031029,-0.000817,0.027333,0.695419
3,1848.0,-0.00247,0.080368,-0.828406,-0.031233,-0.001175,0.02754,0.553827
4,1848.0,-0.00266,0.079149,-0.806038,-0.031588,-0.000656,0.027159,0.55507


In [93]:
# NOTE(review): this cell computes the same per-`distance` table as the cell
# directly before it (identical output) — likely a leftover; consider removing.
df_errors.groupby(by="distance").describe()

Unnamed: 0_level_0,error,error,error,error,error,error,error,error
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
distance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1848.0,-0.001919,0.079401,-0.735278,-0.030001,-0.000102,0.027514,0.51149
2,1848.0,-0.001333,0.080431,-0.734252,-0.031029,-0.000817,0.027333,0.695419
3,1848.0,-0.00247,0.080368,-0.828406,-0.031233,-0.001175,0.02754,0.553827
4,1848.0,-0.00266,0.079149,-0.806038,-0.031588,-0.000656,0.027159,0.55507
