In [2]:

import pandas as pd
import numpy as np

from statsmodels.api import OLS, add_constant
import pandas_datareader.data as web

from linearmodels.asset_pricing import LinearFactorModel

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:

# Apply seaborn's 'whitegrid' style to the plotting session.
sns.set_style('whitegrid')

In [4]:
# Name of the Fama-French dataset holding the five research factors plus RF.
ff_factor = 'F-F_Research_Data_5_Factors_2x3'

# Fetch the dataset for 2010 through 2017-12 from the 'famafrench' source.
ff_factor_tables = web.DataReader(ff_factor, 'famafrench', start='2010', end='2017-12')

# Entry 0 of the reader result is the table used throughout this notebook;
# the info() output shows it carries a monthly PeriodIndex.
ff_factor_data = ff_factor_tables[0]
ff_factor_data.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 96 entries, 2010-01 to 2017-12
Freq: M
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mkt-RF  96 non-null     float64
 1   SMB     96 non-null     float64
 2   HML     96 non-null     float64
 3   RMW     96 non-null     float64
 4   CMA     96 non-null     float64
 5   RF      96 non-null     float64
dtypes: float64(6)
memory usage: 5.2 KB


In [5]:
# Summary statistics for the five factor columns and the RF column.
ff_factor_data.describe()

Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF
count,96.0,96.0,96.0,96.0,96.0,96.0
mean,1.158646,0.054063,-0.051875,0.126146,0.052813,0.012604
std,3.580167,2.290739,2.191647,1.591265,1.409858,0.022583
min,-7.89,-4.51,-4.52,-3.93,-3.35,0.0
25%,-0.9175,-1.66,-1.6275,-1.16,-0.965,0.0
50%,1.235,0.19,-0.305,0.135,-0.015,0.0
75%,3.1975,1.5175,1.1425,1.14,0.9275,0.01
max,11.35,6.8,8.22,3.53,3.78,0.09


In [7]:
# Name of the Fama-French dataset with the 17 industry portfolios.
ff_portfolio = '17_Industry_Portfolios'

# Fetch the same 2010 through 2017-12 window that is used for the factor data.
ff_portfolio_tables = web.DataReader(ff_portfolio, 'famafrench', start='2010', end='2017-12')
ff_portfolio_raw = ff_portfolio_tables[0]

# Subtract the RF column from every industry column, matching rows on the index,
# so the frame holds returns in excess of RF.
ff_portfolio_data = ff_portfolio_raw.sub(ff_factor_data['RF'], axis='index')
ff_portfolio_data.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 96 entries, 2010-01 to 2017-12
Freq: M
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Food    96 non-null     float64
 1   Mines   96 non-null     float64
 2   Oil     96 non-null     float64
 3   Clths   96 non-null     float64
 4   Durbl   96 non-null     float64
 5   Chems   96 non-null     float64
 6   Cnsum   96 non-null     float64
 7   Cnstr   96 non-null     float64
 8   Steel   96 non-null     float64
 9   FabPr   96 non-null     float64
 10  Machn   96 non-null     float64
 11  Cars    96 non-null     float64
 12  Trans   96 non-null     float64
 13  Utils   96 non-null     float64
 14  Rtail   96 non-null     float64
 15  Finan   96 non-null     float64
 16  Other   96 non-null     float64
dtypes: float64(17)
memory usage: 13.5 KB


In [8]:
# Summary statistics of the RF-adjusted industry portfolio returns.
ff_portfolio_data.describe()

Unnamed: 0,Food,Mines,Oil,Clths,Durbl,Chems,Cnsum,Cnstr,Steel,FabPr,Machn,Cars,Trans,Utils,Rtail,Finan,Other
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,1.045625,0.197083,0.547917,1.396979,1.155208,1.303229,1.13625,1.73125,0.555625,1.350521,1.227604,1.278854,1.465,0.890313,1.234375,1.241562,1.282396
std,2.795857,7.902185,5.577552,5.025167,5.137482,5.594216,3.174283,5.246562,7.389824,4.694408,4.811242,5.718887,4.150833,3.23514,3.508655,4.809791,3.708972
min,-5.17,-24.38,-12.01,-10.0,-13.21,-17.39,-7.3,-13.96,-20.49,-11.96,-9.08,-11.65,-8.56,-6.99,-9.18,-11.04,-7.92
25%,-0.785,-5.8475,-3.1675,-1.865,-2.0175,-1.445,-0.92,-2.4625,-4.41,-1.4475,-2.0475,-1.245,-0.88,-0.745,-0.9625,-1.4675,-1.075
50%,0.93,-0.46,1.04,1.16,1.205,1.435,1.47,2.19,0.66,1.485,1.545,0.645,1.505,1.215,0.88,1.955,1.575
75%,3.1875,5.715,3.915,3.8575,4.3225,4.4425,3.3175,5.39,4.22,3.8375,4.6575,4.8025,4.235,2.9525,3.355,4.0925,3.5175
max,6.67,21.92,16.3,17.2,16.58,18.37,8.29,15.55,21.35,17.66,14.65,20.86,13.16,7.9,12.36,13.48,10.79


In [10]:
# Location of the HDF5 asset store. Kept in one named constant so the path is
# changed in a single place when the notebook runs on another machine.
ASSETS_STORE = 'C:/Users/86155/Machine Learning for Algorithmic Trading/data/assets.h5'

# Open read-only. HDFStore's default mode ('a') opens the file for writing and
# creates an empty store when the path does not exist, which would turn a wrong
# path into a confusing KeyError on the lookups below. With mode='r' a missing
# file fails immediately and the data cannot be modified by accident.
with pd.HDFStore(ASSETS_STORE, mode='r') as store:
    # Adjusted close prices, pivoted with unstack() so the innermost index
    # level becomes the columns, then restricted to rows labelled 2010-2017.
    prices = store['/quandl/wiki/prices'].adj_close.unstack().loc['2010':'2017']
    # Equity metadata with exact duplicate rows removed.
    equities = store['/us_equities/stocks'].drop_duplicates()

In [11]:
# Keep only the equities rows whose index label also appears as a price column,
# then map each remaining label to its sector.
sectors = (
    equities
    .filter(items=prices.columns, axis=0)
    ['sector']
    .to_dict()
)

# Restrict prices to the labels that have a sector entry and discard columns
# that contain no observations at all.
prices = (
    prices
    .filter(items=sectors.keys())
    .dropna(how='all', axis=1)
)

In [15]:

# Last available price in each calendar month -> period-over-period change,
# scaled by 100, on a monthly PeriodIndex.
returns = (
    prices
    .resample('M')
    .last()
    .pct_change()
    .mul(100)
    .to_period('M')
)

# First drop rows that are entirely missing, then drop every column that still
# has at least one missing value.
returns = (
    returns
    .dropna(how='all')
    .dropna(axis=1)
)
returns.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 95 entries, 2010-02 to 2017-12
Freq: M
Columns: 1986 entries, A to ZUMZ
dtypes: float64(1986)
memory usage: 1.4 MB


In [16]:
# Restrict the factor and industry portfolio frames to exactly the periods
# present in the stock returns index. Both names are rebound to the filtered frames.
ff_factor_data = ff_factor_data.loc[returns.index]
ff_portfolio_data = ff_portfolio_data.loc[returns.index]

In [17]:
# Factor summary statistics again, now over the periods selected via returns.index.
ff_factor_data.describe()

Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF
count,95.0,95.0,95.0,95.0,95.0,95.0
mean,1.206211,0.051053,-0.055895,0.139895,0.048842,0.012737
std,3.568537,2.302701,2.202918,1.593964,1.416798,0.022665
min,-7.89,-4.51,-4.52,-3.93,-3.35,0.0
25%,-0.565,-1.67,-1.655,-0.965,-0.99,0.0
50%,1.29,0.15,-0.36,0.14,-0.02,0.0
75%,3.265,1.555,1.165,1.14,0.935,0.01
max,11.35,6.8,8.22,3.53,3.78,0.09


In [18]:
# Subtract the RF column from every stock column, matching rows on the period
# index, to obtain returns in excess of RF.
risk_free = ff_factor_data['RF']
excess_returns = returns.sub(risk_free, axis='index')
excess_returns.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 95 entries, 2010-02 to 2017-12
Freq: M
Columns: 1986 entries, A to ZUMZ
dtypes: float64(1986)
memory usage: 1.4 MB


In [19]:
# Bounds are the 1st and 99th percentiles taken over all values of the frame
# pooled together (np.percentile is called without an axis).
clip_lo, clip_hi = np.percentile(excess_returns, [1, 99])

# Cap values outside the bounds. The name is rebound to the clipped frame, so
# re-running this cell clips already-clipped data against fresh percentiles.
excess_returns = excess_returns.clip(lower=clip_lo, upper=clip_hi)

In [20]:

# Inspect the industry portfolio frame after it was restricted to returns.index.
ff_portfolio_data.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 95 entries, 2010-02 to 2017-12
Freq: M
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Food    95 non-null     float64
 1   Mines   95 non-null     float64
 2   Oil     95 non-null     float64
 3   Clths   95 non-null     float64
 4   Durbl   95 non-null     float64
 5   Chems   95 non-null     float64
 6   Cnsum   95 non-null     float64
 7   Cnstr   95 non-null     float64
 8   Steel   95 non-null     float64
 9   FabPr   95 non-null     float64
 10  Machn   95 non-null     float64
 11  Cars    95 non-null     float64
 12  Trans   95 non-null     float64
 13  Utils   95 non-null     float64
 14  Rtail   95 non-null     float64
 15  Finan   95 non-null     float64
 16  Other   95 non-null     float64
dtypes: float64(17)
memory usage: 13.4 KB


In [21]:
# Remove the RF column so that only the five factor columns are left as
# regressors. The earlier cell that builds excess_returns reads
# ff_factor_data.RF, so it has to run before this one.
ff_factor_data = ff_factor_data.drop(columns='RF')
ff_factor_data.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 95 entries, 2010-02 to 2017-12
Freq: M
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mkt-RF  95 non-null     float64
 1   SMB     95 non-null     float64
 2   HML     95 non-null     float64
 3   RMW     95 non-null     float64
 4   CMA     95 non-null     float64
dtypes: float64(5)
memory usage: 4.5 KB


In [23]:
# First stage: one time-series OLS per industry portfolio, regressing its
# returns on the factors plus an intercept. The regressor matrix is the same
# for every industry, so it is built once up front.
factor_exog = add_constant(ff_factor_data)

betas = []
for industry in ff_portfolio_data:
    # Select the industry's returns on exactly the periods of the factor frame.
    industry_returns = ff_portfolio_data.loc[ff_factor_data.index, industry]
    step1 = OLS(endog=industry_returns, exog=factor_exog).fit()
    # Keep only the factor coefficients; the intercept is discarded.
    betas.append(step1.params.drop('const'))

In [26]:

# Assemble the per-industry coefficient Series collected above into a single
# frame: one row per industry portfolio, one column per factor.
# The name `betas` is rebound from the list to the resulting DataFrame.
betas = pd.DataFrame(betas, 
                     columns=ff_factor_data.columns, 
                     index=ff_portfolio_data.columns)
betas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, Food  to Other
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mkt-RF  17 non-null     float64
 1   SMB     17 non-null     float64
 2   HML     17 non-null     float64
 3   RMW     17 non-null     float64
 4   CMA     17 non-null     float64
dtypes: float64(5)
memory usage: 1.3+ KB


In [27]:
# Display the estimated factor coefficients per industry portfolio.
betas

Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA
Food,0.686576,-0.310043,-0.349134,0.307384,0.466381
Mines,1.298828,0.180687,0.18895,0.145656,0.611137
Oil,1.055321,0.155424,0.668664,-0.024625,0.311558
Clths,0.968569,0.342928,-0.187901,0.564801,0.037814
Durbl,1.174077,0.537849,0.070705,0.511997,-0.13124
Chems,1.351075,0.166163,0.19517,0.141558,-0.230187
Cnsum,0.762473,-0.332759,-0.577102,-0.060718,0.5745
Cnstr,1.116148,0.446255,0.092263,-0.010816,0.140574
Steel,1.464861,0.41049,0.400025,0.136067,0.483913
FabPr,1.069532,0.460182,-0.029259,0.153257,0.18636


In [36]:
# Second stage: one cross-sectional OLS per period, regressing that period's
# industry portfolio returns on the first-stage betas. No constant is added
# to the regressors here.
lambdas = []
for period in ff_portfolio_data.index:
    # Row of industry returns for this period.
    cross_section = ff_portfolio_data.loc[period]
    step2 = OLS(endog=cross_section, exog=betas).fit()
    # One coefficient per factor for this period.
    lambdas.append(step2.params)

In [37]:
# Collect the per-period coefficient Series into a single frame: one row per
# period, one column per factor (column labels taken from `betas`).
# The name `lambdas` is rebound from the list to the resulting DataFrame.
lambdas = pd.DataFrame(lambdas, 
                       index=ff_portfolio_data.index,
                       columns=betas.columns.tolist())
lambdas.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 95 entries, 2010-02 to 2017-12
Freq: M
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mkt-RF  95 non-null     float64
 1   SMB     95 non-null     float64
 2   HML     95 non-null     float64
 3   RMW     95 non-null     float64
 4   CMA     95 non-null     float64
dtypes: float64(5)
memory usage: 9.3 KB


In [38]:
# Display the per-period second-stage coefficients.
lambdas

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-02,3.740339,5.689958,-1.217749,1.600576,-0.165497
2010-03,6.253866,0.161723,-0.242763,2.363063,-1.205920
2010-04,1.790520,7.708622,-3.703789,-1.046971,-2.882010
2010-05,-7.728963,3.636641,-1.010362,0.799494,0.132338
2010-06,-5.309910,-5.861009,-1.546481,-2.468986,0.053437
...,...,...,...,...,...
2017-08,0.216417,0.371040,-2.950801,-4.476634,0.347925
2017-09,2.987877,5.435670,2.297705,-2.829133,-1.014815
2017-10,2.340509,-2.182159,0.506346,1.920259,-3.711163
2017-11,2.882542,-2.403978,-1.285934,5.767601,-2.748155
