In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf

### Read Company Return Data from `stkdata.sas7bdat`

In [2]:
# Load the company return data from the SAS file, decoding text columns as UTF-8.
stkdata = pd.read_sas(
    'stkdata.sas7bdat',
    encoding='utf-8',
)

# Peek at the first five rows to confirm the columns came through.
print(stkdata.head(n=5))

        DATE TICKER       RET
0 2011-01-31   AAPL  0.051959
1 2011-02-28   AAPL  0.040935
2 2011-03-31   AAPL -0.013314
3 2011-04-29   AAPL  0.004656
4 2011-05-31   AAPL -0.006569


In [3]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so call it directly; wrapping it in print() appends a stray "None" line.
stkdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   DATE    6000 non-null   datetime64[ns]
 1   TICKER  6000 non-null   object        
 2   RET     6000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 140.8+ KB
None


### Now we pick `AAPL`, `INTC`, and `MSFT`

In [4]:
# Keep only the rows whose TICKER is one of the three firms studied below.
stkdata = stkdata.loc[
    lambda frame: frame['TICKER'].isin(['AAPL', 'INTC', 'MSFT'])
]

In [5]:
# First five rows of the filtered return data.
print(stkdata.head(n=5))

        DATE TICKER       RET
0 2011-01-31   AAPL  0.051959
1 2011-02-28   AAPL  0.040935
2 2011-03-31   AAPL -0.013314
3 2011-04-29   AAPL  0.004656
4 2011-05-31   AAPL -0.006569


In [6]:
# DataFrame.info() prints its own report and returns None; calling it
# without print() avoids the extra "None" line after the summary.
stkdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 360 entries, 0 to 4079
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   DATE    360 non-null    datetime64[ns]
 1   TICKER  360 non-null    object        
 2   RET     360 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 11.2+ KB
None


### Read market data from `mktdata.sas7bdat`

In [7]:
# Load the market data from the SAS file, decoding text columns as UTF-8.
mktdata = pd.read_sas(
    'mktdata.sas7bdat',
    encoding='utf-8',
)

In [8]:
# First five rows of the market data.
print(mktdata.head(n=5))

        DATE     SMB     HML   MKTRF      RF     UMD
0 2011-01-31 -0.0252  0.0082  0.0199  0.0001 -0.0029
1 2011-02-28  0.0153  0.0129  0.0349  0.0001  0.0208
2 2011-03-31  0.0258 -0.0176  0.0046  0.0001  0.0352
3 2011-04-29 -0.0037 -0.0243  0.0290  0.0000  0.0006
4 2011-05-31 -0.0058 -0.0205 -0.0127  0.0000 -0.0057


In [9]:
# DataFrame.info() prints its own report and returns None; calling it
# without print() avoids the extra "None" line after the summary.
mktdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   DATE    120 non-null    datetime64[ns]
 1   SMB     120 non-null    float64       
 2   HML     120 non-null    float64       
 3   MKTRF   120 non-null    float64       
 4   RF      120 non-null    float64       
 5   UMD     120 non-null    float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 5.8 KB
None


### Obtain the summary statistics of variables in `mktdata`

Please note that the 10th percentile of `MKTRF` differs from the SAS answer. We have 120 observations, so the 10th percentile lies between the 12th and 13th smallest observations. SAS simply takes the average of these two numbers, while pandas by default interpolates linearly at zero-based position $(n-1)q = 119 \times 0.1 = 11.9$, i.e. 90% of the way from the 12th to the 13th smallest observation.

In [10]:
# Summary statistics for MKTRF and RF, reporting the 10th and 90th
# percentiles (describe() also includes the median).
print(
    mktdata.loc[:, ['MKTRF', 'RF']].describe(percentiles=[0.10, 0.90])
)

            MKTRF          RF
count  120.000000  120.000000
mean     0.011495    0.000456
std      0.041239    0.000661
min     -0.133800    0.000000
10%     -0.033510    0.000000
50%      0.013250    0.000100
90%      0.055800    0.001600
max      0.136500    0.002100


### Merge these two data files: `stkdata` and `mktdata`

In [11]:
# Attach the market data to every stock row that shares the same DATE.
# how='inner' spells out the default join. validate='many_to_one' makes
# pandas raise MergeError if DATE is duplicated in mktdata, which would
# otherwise silently duplicate stock rows in the regression sample.
Regdata = pd.merge(
    stkdata,
    mktdata,
    on='DATE',
    how='inner',
    validate='many_to_one',
)

In [12]:
# First five rows of the merged stock/market data.
print(Regdata.head(n=5))

        DATE TICKER       RET     SMB     HML   MKTRF      RF     UMD
0 2011-01-31   AAPL  0.051959 -0.0252  0.0082  0.0199  0.0001 -0.0029
1 2011-01-31   INTC  0.020447 -0.0252  0.0082  0.0199  0.0001 -0.0029
2 2011-01-31   MSFT -0.006628 -0.0252  0.0082  0.0199  0.0001 -0.0029
3 2011-02-28   AAPL  0.040935  0.0153  0.0129  0.0349  0.0001  0.0208
4 2011-02-28   INTC  0.008910  0.0153  0.0129  0.0349  0.0001  0.0208


In [13]:
# DataFrame.info() prints its own report and returns None; calling it
# without print() avoids the extra "None" line after the summary.
Regdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 360 entries, 0 to 359
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   DATE    360 non-null    datetime64[ns]
 1   TICKER  360 non-null    object        
 2   RET     360 non-null    float64       
 3   SMB     360 non-null    float64       
 4   HML     360 non-null    float64       
 5   MKTRF   360 non-null    float64       
 6   RF      360 non-null    float64       
 7   UMD     360 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 25.3+ KB
None


In [14]:
# RETRF is the stock return minus the RF column (element-wise).
Regdata['RETRF'] = Regdata['RET'].sub(Regdata['RF'])

In [15]:
# Print the merged frame, now including the RETRF column. pandas abbreviates
# long frames, so only the first and last rows appear in the output.
print(Regdata)

          DATE TICKER       RET     SMB     HML   MKTRF      RF     UMD  \
0   2011-01-31   AAPL  0.051959 -0.0252  0.0082  0.0199  0.0001 -0.0029   
1   2011-01-31   INTC  0.020447 -0.0252  0.0082  0.0199  0.0001 -0.0029   
2   2011-01-31   MSFT -0.006628 -0.0252  0.0082  0.0199  0.0001 -0.0029   
3   2011-02-28   AAPL  0.040935  0.0153  0.0129  0.0349  0.0001  0.0208   
4   2011-02-28   INTC  0.008910  0.0153  0.0129  0.0349  0.0001  0.0208   
..         ...    ...       ...     ...     ...     ...     ...     ...   
355 2020-11-30   INTC  0.099368  0.0548  0.0211  0.1247  0.0001 -0.1225   
356 2020-11-30   MSFT  0.060058  0.0548  0.0211  0.1247  0.0001 -0.1225   
357 2020-12-31   AAPL  0.114574  0.0481 -0.0136  0.0463  0.0001 -0.0242   
358 2020-12-31   INTC  0.030403  0.0481 -0.0136  0.0463  0.0001 -0.0242   
359 2020-12-31   MSFT  0.039006  0.0481 -0.0136  0.0463  0.0001 -0.0242   

        RETRF  
0    0.051859  
1    0.020347  
2   -0.006728  
3    0.040835  
4    0.008810  
.. 

### Note:

We don't have missing observations in this case, so the default merge is good. Even if we have missing observations, we will skip them automatically when running regressions anyway.

### Summary statistics of `RETRF`

- We include `RETRF` and `TICKER` in the dataset for `.describe()`. `TICKER` is included because we use it in `.groupby()`.
- Again, the definition of `10 percentile` and `90 percentile` is slightly different from SAS.

In [16]:
# Per-ticker summary statistics of RETRF. TICKER is kept in the selection
# because it is the grouping key.
print(
    Regdata.loc[:, ['RETRF', 'TICKER']]
    .groupby('TICKER')
    .describe(percentiles=[0.10, 0.90])
)

        RETRF                                                              \
        count      mean       std       min       10%       50%       90%   
TICKER                                                                      
AAPL    120.0  0.024457  0.079606 -0.182509 -0.076892  0.024895  0.127021   
INTC    120.0  0.011526  0.066703 -0.202340 -0.077111  0.015477  0.093320   
MSFT    120.0  0.020553  0.058081 -0.130248 -0.057998  0.020384  0.086073   

                  
             max  
TICKER            
AAPL    0.216209  
INTC    0.193690  
MSFT    0.196409  


### Run regression for each firm at once

- In each iteration of the `for` loop, we extract the observations of one company into `tempdf`.

In [17]:
# Regress RETRF on MKTRF separately for each of the three firms.
for TIC in ('AAPL', 'INTC', 'MSFT'):
    # Rows belonging to the current ticker only.
    tempdf = Regdata.loc[Regdata['TICKER'].eq(TIC)]
    mdl = smf.ols(formula='RETRF ~ MKTRF', data=tempdf).fit()
    # Blank line, ticker, blank line, then the full regression table.
    print('', TIC, '', sep='\n')
    print(mdl.summary())


AAPL

                            OLS Regression Results                            
Dep. Variable:                  RETRF   R-squared:                       0.306
Model:                            OLS   Adj. R-squared:                  0.300
Method:                 Least Squares   F-statistic:                     52.02
Date:                Mon, 22 Mar 2021   Prob (F-statistic):           5.71e-11
Time:                        12:56:04   Log-Likelihood:                 155.82
No. Observations:                 120   AIC:                            -307.6
Df Residuals:                     118   BIC:                            -302.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0122      0.006      1.930  

### Multiple Regressions (Not Required in this homework)

Please note that in this example we use `Regdata['TICKER'].unique()` (a `Series` method) to generate the array of companies instead of a hard-coded list.

In [18]:
# Regress RETRF on MKTRF, SMB, HML and UMD separately for every ticker
# present in Regdata, in order of first appearance.
for TIC in Regdata['TICKER'].unique():
    # Rows belonging to the current ticker only.
    tempdf = Regdata.loc[Regdata['TICKER'].eq(TIC)]
    mdl = smf.ols(
        formula='RETRF ~ MKTRF + SMB + HML + UMD',
        data=tempdf,
    ).fit()
    # Blank line, ticker, blank line, then the full regression table.
    print('', TIC, '', sep='\n')
    print(mdl.summary())


AAPL

                            OLS Regression Results                            
Dep. Variable:                  RETRF   R-squared:                       0.389
Model:                            OLS   Adj. R-squared:                  0.367
Method:                 Least Squares   F-statistic:                     18.28
Date:                Mon, 22 Mar 2021   Prob (F-statistic):           1.19e-11
Time:                        12:56:04   Log-Likelihood:                 163.44
No. Observations:                 120   AIC:                            -316.9
Df Residuals:                     115   BIC:                            -303.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0066      0.006      1.064  