In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf

### Data Preparation

In [2]:
# Load the stock-return panel from the SAS dataset, decoding text columns as UTF-8.
stkdata = pd.read_sas(
    filepath_or_buffer='stkdata.sas7bdat',
    encoding='utf-8',
)

# Plain-text preview of the first five rows.
print(stkdata.iloc[:5])

        DATE TICKER       RET
0 2011-01-31   AAPL  0.051959
1 2011-02-28   AAPL  0.040935
2 2011-03-31   AAPL -0.013314
3 2011-04-29   AAPL  0.004656
4 2011-05-31   AAPL -0.006569


In [3]:
# Keep only the rows for the three tickers analysed in this notebook.
selected_tickers = ['CVX', 'JNJ', 'PFE']
stkdata = stkdata.loc[stkdata['TICKER'].isin(selected_tickers)]

In [4]:
# Load the factor dataset from SAS, decoding text columns as UTF-8.
mktdata = pd.read_sas(
    filepath_or_buffer='mktdata.sas7bdat',
    encoding='utf-8',
)

In [5]:
# Attach the factor columns to every stock row sharing the same DATE
# (pandas' default inner join on the key).
Regdata = stkdata.merge(mktdata, on='DATE')

In [6]:
# Excess return used as the dependent variable below: RET minus RF.
Regdata['RETRF'] = Regdata['RET'].sub(Regdata['RF'])

### CAPM Estimation

In [8]:
# Fit RETRF on MKTRF by OLS for each listed ticker (currently only CVX)
# and print the regression summary under a ticker heading.
for TIC in ('CVX',):
    tempdf = Regdata.loc[Regdata['TICKER'] == TIC]
    capm_spec = smf.ols(formula='RETRF ~ MKTRF', data=tempdf)
    mdl_CVX_CAPM = capm_spec.fit()
    # Blank line, ticker label, blank line -- then the summary table.
    print(f'\n{TIC}\n')
    print(mdl_CVX_CAPM.summary())


CVX

                            OLS Regression Results                            
Dep. Variable:                  RETRF   R-squared:                       0.531
Model:                            OLS   Adj. R-squared:                  0.527
Method:                 Least Squares   F-statistic:                     133.5
Date:                Sun, 02 May 2021   Prob (F-statistic):           4.11e-21
Time:                        15:26:03   Log-Likelihood:                 198.75
No. Observations:                 120   AIC:                            -393.5
Df Residuals:                     118   BIC:                            -387.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0093      0.004     -2.116   

In [9]:
def anova_table(mdl_result):
    """Print an ANOVA decomposition (Model / Error / Total) for a fitted regression.

    Parameters
    ----------
    mdl_result : fitted regression results object
        Must expose the attributes ``df_model``, ``df_resid``, ``ess``,
        ``ssr``, ``centered_tss``, ``mse_model`` and ``mse_resid`` (the
        objects returned by ``smf.ols(...).fit()`` in this notebook do).

    Returns
    -------
    None
        The table is written to stdout.
    """
    anova_df = pd.DataFrame(
        {
            'DF': [
                mdl_result.df_model,
                mdl_result.df_resid,
                mdl_result.df_model + mdl_result.df_resid,
            ],
            'Sum of Squares': [
                mdl_result.ess,
                mdl_result.ssr,
                mdl_result.centered_tss,
            ],
            # The Total row has no mean square.  Use NaN instead of '' so the
            # column keeps a float dtype; an empty string would turn it into an
            # object column and the numbers would no longer be formatted
            # consistently with the other columns.
            'Mean Square': [
                mdl_result.mse_model,
                mdl_result.mse_resid,
                float('nan'),
            ],
        },
        index=pd.Index(['Model', 'Error', 'Total'], name='Source'),
    )
    # Degrees of freedom arrive as floats; show them as whole numbers.
    anova_df['DF'] = anova_df['DF'].astype('int')
    # na_rep='' renders the missing Total mean square as a blank cell.
    print(anova_df.to_string(na_rep=''))

In [10]:
# ANOVA decomposition for the CAPM fit.
anova_table(mdl_result=mdl_CVX_CAPM)

         DF  Sum of Squares Mean Square
Source                                 
Model     1        0.289457    0.289457
Error   118        0.255938  0.00216897
Total   119        0.545395            


### Fama and French 3-Factor Model Estimation and F-Test against CAPM

In [11]:
# Fit the three-factor regression (MKTRF, SMB, HML) for each listed ticker
# (currently only CVX), then jointly test that the SMB and HML coefficients
# are zero, i.e. test the three-factor model against the CAPM.
for TIC in ('CVX',):
    tempdf = Regdata.loc[Regdata['TICKER'] == TIC]
    ff3_spec = smf.ols(formula='RETRF ~ MKTRF + SMB + HML', data=tempdf)
    mdl_CVX_FF3 = ff3_spec.fit()
    # Blank line, ticker label, blank line -- then the summary table.
    print(f'\n{TIC}\n')
    # Summary followed by one empty separator line.
    print(mdl_CVX_FF3.summary(), end='\n\n')
    hypotheses = '(SMB = 0), (HML=0)'
    f_test = mdl_CVX_FF3.f_test(hypotheses)
    print(f_test)


CVX

                            OLS Regression Results                            
Dep. Variable:                  RETRF   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.600
Method:                 Least Squares   F-statistic:                     60.62
Date:                Sun, 02 May 2021   Prob (F-statistic):           1.20e-23
Time:                        15:27:18   Log-Likelihood:                 209.93
No. Observations:                 120   AIC:                            -411.9
Df Residuals:                     116   BIC:                            -400.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0046      0.004     -1.110   

In [12]:
# ANOVA decomposition for the three-factor fit.
anova_table(mdl_result=mdl_CVX_FF3)

         DF  Sum of Squares Mean Square
Source                                 
Model     3        0.332989    0.110996
Error   116        0.212406  0.00183108
Total   119        0.545395            


### Collinearity (VIF)

In [13]:
# Collinearity check: regress each factor on the other two (auxiliary
# regressions) and report the variance inflation factor, VIF = 1 / (1 - R^2).
# Previously only the regression summaries were printed, so the VIF itself
# had to be worked out by hand from the R-squared line.
aux_formulas = (
    ('VIF1', 'MKTRF ~ SMB + HML'),
    ('VIF2', 'SMB ~ MKTRF + HML'),
    ('VIF3', 'HML ~ MKTRF + SMB'),
)

aux_fits = {}
for label, formula in aux_formulas:
    aux_fits[label] = smf.ols(formula, data=mktdata).fit()
    print(label)
    print()
    print(aux_fits[label].summary())
    print()
    # R-squared of the auxiliary regression drives the VIF for its
    # left-hand-side factor.
    print(f'{label} = {1.0 / (1.0 - aux_fits[label].rsquared):.4f}')
    print()

# Keep the original per-model names available for any later cells.
mdl_VIF1, mdl_VIF2, mdl_VIF3 = (aux_fits[label] for label, _ in aux_formulas)

VIF1

                            OLS Regression Results                            
Dep. Variable:                  MKTRF   R-squared:                       0.176
Model:                            OLS   Adj. R-squared:                  0.162
Method:                 Least Squares   F-statistic:                     12.52
Date:                Sun, 02 May 2021   Prob (F-statistic):           1.18e-05
Time:                        15:27:53   Log-Likelihood:                 224.47
No. Observations:                 120   AIC:                            -442.9
Df Residuals:                     117   BIC:                            -434.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0126      0.003      3.607   

### Constructing the Dummy Variable D (1 if MKTRF >= 0, else 0)

In [14]:
# D is 1 where MKTRF >= 0 and 0 where MKTRF < 0; rows matching neither
# comparison (missing MKTRF) are left as NaN.
mktrf = Regdata['MKTRF']
Regdata['D'] = np.select([mktrf >= 0, mktrf < 0], [1.0, 0.0], default=np.nan)

In [15]:
# Show the first 15 rows: per the original note, the first D = 0 observation
# only appears in the fifth month of the sample, so a shorter preview would
# show D = 1 everywhere.

Regdata.iloc[:15]

Unnamed: 0,DATE,TICKER,RET,SMB,HML,MKTRF,RF,UMD,RETRF,D
0,2011-01-31,CVX,0.040329,-0.0252,0.0082,0.0199,0.0001,-0.0029,0.040229,1.0
1,2011-01-31,JNJ,-0.03363,-0.0252,0.0082,0.0199,0.0001,-0.0029,-0.03373,1.0
2,2011-01-31,PFE,0.040548,-0.0252,0.0082,0.0199,0.0001,-0.0029,0.040448,1.0
3,2011-02-28,CVX,0.100495,0.0153,0.0129,0.0349,0.0001,0.0208,0.100395,1.0
4,2011-02-28,JNJ,0.036975,0.0153,0.0129,0.0349,0.0001,0.0208,0.036875,1.0
5,2011-02-28,PFE,0.066959,0.0153,0.0129,0.0349,0.0001,0.0208,0.066859,1.0
6,2011-03-31,CVX,0.036048,0.0258,-0.0176,0.0046,0.0001,0.0352,0.035948,1.0
7,2011-03-31,JNJ,-0.035645,0.0258,-0.0176,0.0046,0.0001,0.0352,-0.035745,1.0
8,2011-03-31,PFE,0.055613,0.0258,-0.0176,0.0046,0.0001,0.0352,0.055513,1.0
9,2011-04-29,CVX,0.018141,-0.0037,-0.0243,0.029,0.0,0.0006,0.018141,1.0


### Chow Test

In [16]:
# Interaction terms for the Chow test: each factor multiplied by the dummy D,
# stored as DMKTRF, DSMB and DHML.
for factor in ('MKTRF', 'SMB', 'HML'):
    Regdata['D' + factor] = Regdata[factor] * Regdata['D']

In [17]:
# Same 15-row preview, now including the interaction columns.
Regdata.iloc[:15]

Unnamed: 0,DATE,TICKER,RET,SMB,HML,MKTRF,RF,UMD,RETRF,D,DMKTRF,DSMB,DHML
0,2011-01-31,CVX,0.040329,-0.0252,0.0082,0.0199,0.0001,-0.0029,0.040229,1.0,0.0199,-0.0252,0.0082
1,2011-01-31,JNJ,-0.03363,-0.0252,0.0082,0.0199,0.0001,-0.0029,-0.03373,1.0,0.0199,-0.0252,0.0082
2,2011-01-31,PFE,0.040548,-0.0252,0.0082,0.0199,0.0001,-0.0029,0.040448,1.0,0.0199,-0.0252,0.0082
3,2011-02-28,CVX,0.100495,0.0153,0.0129,0.0349,0.0001,0.0208,0.100395,1.0,0.0349,0.0153,0.0129
4,2011-02-28,JNJ,0.036975,0.0153,0.0129,0.0349,0.0001,0.0208,0.036875,1.0,0.0349,0.0153,0.0129
5,2011-02-28,PFE,0.066959,0.0153,0.0129,0.0349,0.0001,0.0208,0.066859,1.0,0.0349,0.0153,0.0129
6,2011-03-31,CVX,0.036048,0.0258,-0.0176,0.0046,0.0001,0.0352,0.035948,1.0,0.0046,0.0258,-0.0176
7,2011-03-31,JNJ,-0.035645,0.0258,-0.0176,0.0046,0.0001,0.0352,-0.035745,1.0,0.0046,0.0258,-0.0176
8,2011-03-31,PFE,0.055613,0.0258,-0.0176,0.0046,0.0001,0.0352,0.055513,1.0,0.0046,0.0258,-0.0176
9,2011-04-29,CVX,0.018141,-0.0037,-0.0243,0.029,0.0,0.0006,0.018141,1.0,0.029,-0.0037,-0.0243


In [18]:
# Chow test via a dummy-variable regression: fit the three-factor model with
# the dummy D and its interactions for each listed ticker (currently only
# CVX), then jointly test that D and all three interaction coefficients are
# zero.
for TIC in ('CVX',):
    tempdf = Regdata.loc[Regdata['TICKER'] == TIC]
    dummy_spec = smf.ols(
        formula='RETRF ~ D + MKTRF + SMB + HML + DMKTRF + DSMB + DHML',
        data=tempdf,
    )
    mdl_CVX_dummy = dummy_spec.fit()
    # Blank line, ticker label, blank line -- then the summary table.
    print(f'\n{TIC}\n')
    # Summary followed by one empty separator line.
    print(mdl_CVX_dummy.summary(), end='\n\n')
    hypotheses = '(D=0), (DMKTRF=0), (DSMB = 0), (DHML=0)'
    f_test = mdl_CVX_dummy.f_test(hypotheses)
    print(f_test)


CVX

                            OLS Regression Results                            
Dep. Variable:                  RETRF   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.604
Method:                 Least Squares   F-statistic:                     26.97
Date:                Sun, 02 May 2021   Prob (F-statistic):           2.30e-21
Time:                        15:28:50   Log-Likelihood:                 212.63
No. Observations:                 120   AIC:                            -409.3
Df Residuals:                     112   BIC:                            -387.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0089      0.010     -0.853   

In [20]:
# ANOVA decomposition for the dummy-interaction fit.
anova_table(mdl_result=mdl_CVX_dummy)

         DF  Sum of Squares Mean Square
Source                                 
Model     7        0.342317   0.0489024
Error   112        0.203078   0.0018132
Total   119        0.545395            
