In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import statsmodels.api as sm
# import statsmodels.formula.api as smf
from tqdm.auto import tqdm

In [2]:
# Daily factor file, indexed by its parsed `date` column. The preview below
# shows the columns it carries (mktrf, smb, hml, rf, umd).
rf_rates = pd.read_csv(
    "Data/rf_rates.csv",
    index_col="date",
    parse_dates=["date"],
)
rf_rates.head()

Unnamed: 0_level_0,mktrf,smb,hml,rf,umd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-01-03,0.015,-0.002,0.0016,0.00017,0.0115
2006-01-04,0.0046,0.0036,0.0003,0.00017,0.0024
2006-01-05,0.0003,0.0028,-0.001,0.00017,-0.0055
2006-01-06,0.0092,0.0011,-0.0024,0.00017,0.0029
2006-01-09,0.0045,0.0052,-0.0017,0.00017,-0.0005


mktrf > Excess Return on the Market\
rf > Risk-Free Return Rate (One Month Treasury Bill Rate)

In [3]:
# Load the cleaned daily price panel, indexed by the parsed `datadate` column.
# The path uses a forward slash: the previous "Data\WRDS_prices.csv" literal
# contained the invalid escape sequence "\W" and only resolved on Windows,
# while forward slashes work on every platform.
prices_df = (
    pd.read_csv(
        "Data/WRDS_prices.csv",
        parse_dates=["datadate"],
        index_col="datadate",
    )
    # Use the upper-case key name that the rest of the notebook groups on.
    .rename(columns={"cik": "CIK"})
)
prices_df.head()

Unnamed: 0_level_0,CIK,VOLUME,TTLCMNSHARESOUT,CLOSEPRICE
datadate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-03,20,4632.0,2576000.0,37.03
2006-01-04,20,1806.0,2576000.0,36.5
2006-01-05,20,495.0,2576000.0,36.75
2006-01-06,20,1100.0,2576000.0,37.0
2006-01-09,20,1250.0,2576000.0,37.0


In [4]:
# One-period simple return of CLOSEPRICE, computed separately for each CIK.
# fill_method=None means missing prices are not filled before differencing.
# Disabled pre-step: forward-fill at most one missing price per gap.
# prices_df["CLOSEPRICE"] = prices_df.groupby("CIK")["CLOSEPRICE"].ffill(limit=1)
prices_df["Return"] = (
    prices_df
    .groupby("CIK")["CLOSEPRICE"]
    .pct_change(periods=1, fill_method=None)
)
# Disabled follow-up: log return derived from the simple return.
# prices_df["log_Return"] = np.log(prices_df["Return"] + 1)

In [5]:
# Attach the factor columns to every firm-day by matching the two date
# indexes. The left join keeps every row of the price panel.
Beta_df = prices_df[["CIK", "Return"]].merge(
    rf_rates,
    how="left",
    left_index=True,
    right_index=True,
)

In [6]:
# Excess return = firm return minus the risk-free rate. Rows missing the
# excess return or any of the three factors are removed, and the unused
# `umd` column is discarded.
Beta_df = (
    Beta_df
    .assign(ex_Return=lambda frame: frame["Return"] - frame["rf"])
    .dropna(subset=["ex_Return", "mktrf", "smb", "hml"])
    .drop(columns=["umd"])
)
Beta_df.head()

Unnamed: 0_level_0,CIK,Return,mktrf,smb,hml,rf,ex_Return
datadate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-01-04,20,-0.014313,0.0046,0.0036,0.0003,0.00017,-0.014483
2006-01-05,20,0.006849,0.0003,0.0028,-0.001,0.00017,0.006679
2006-01-06,20,0.006803,0.0092,0.0011,-0.0024,0.00017,0.006633
2006-01-09,20,0.0,0.0045,0.0052,-0.0017,0.00017,-0.00017
2006-01-11,20,-0.0125,0.0028,-0.0023,-0.0011,0.00017,-0.01267


In [7]:
N = 63  # rolling window length in rows (3 months)

# NOTE(review): both rolling results are written back by position, which
# presumes Beta_df rows are already ordered by CIK (and by date within each
# CIK) -- confirm the upstream ordering.

# Rolling variance of the market factor within each firm; at least N // 2
# observations are required before a value is produced.
Beta_df[f"mktrf_var_{N}"] = (
    Beta_df.groupby("CIK")["mktrf"]
    .rolling(window=N, min_periods=N // 2)
    .var()
    .to_numpy()
)

# Rolling covariance between the market factor and the firm's excess return,
# taken from the (mktrf, ex_Return) cell of the rolling covariance matrix.
Beta_df[f"covar_{N}"] = (
    Beta_df.groupby("CIK")[["mktrf", "ex_Return"]]
    .rolling(window=N, min_periods=N // 2)
    .cov()
    .unstack()[("mktrf", "ex_Return")]
    .to_numpy()
)

# Beta = covariance / market variance.
Beta_df[f"Beta_{N}"] = Beta_df[f"covar_{N}"].div(Beta_df[f"mktrf_var_{N}"])

In [8]:
N = 126  # rolling window length in rows (6 months)

# NOTE(review): both rolling results are written back by position, which
# presumes Beta_df rows are already ordered by CIK (and by date within each
# CIK) -- confirm the upstream ordering.

# Rolling variance of the market factor within each firm; at least N // 2
# observations are required before a value is produced.
Beta_df[f"mktrf_var_{N}"] = (
    Beta_df.groupby("CIK")["mktrf"]
    .rolling(window=N, min_periods=N // 2)
    .var()
    .to_numpy()
)

# Rolling covariance between the market factor and the firm's excess return,
# taken from the (mktrf, ex_Return) cell of the rolling covariance matrix.
Beta_df[f"covar_{N}"] = (
    Beta_df.groupby("CIK")[["mktrf", "ex_Return"]]
    .rolling(window=N, min_periods=N // 2)
    .cov()
    .unstack()[("mktrf", "ex_Return")]
    .to_numpy()
)

# Beta = covariance / market variance.
Beta_df[f"Beta_{N}"] = Beta_df[f"covar_{N}"].div(Beta_df[f"mktrf_var_{N}"])

In [9]:
N = 252  # rolling window length in rows (1 year)

# NOTE(review): both rolling results are written back by position, which
# presumes Beta_df rows are already ordered by CIK (and by date within each
# CIK) -- confirm the upstream ordering.

# Rolling variance of the market factor within each firm; at least N // 2
# observations are required before a value is produced.
Beta_df[f"mktrf_var_{N}"] = (
    Beta_df.groupby("CIK")["mktrf"]
    .rolling(window=N, min_periods=N // 2)
    .var()
    .to_numpy()
)

# Rolling covariance between the market factor and the firm's excess return,
# taken from the (mktrf, ex_Return) cell of the rolling covariance matrix.
Beta_df[f"covar_{N}"] = (
    Beta_df.groupby("CIK")[["mktrf", "ex_Return"]]
    .rolling(window=N, min_periods=N // 2)
    .cov()
    .unstack()[("mktrf", "ex_Return")]
    .to_numpy()
)

# Beta = covariance / market variance.
Beta_df[f"Beta_{N}"] = Beta_df[f"covar_{N}"].div(Beta_df[f"mktrf_var_{N}"])

In [11]:
# Abnormal return = actual return - expected return, where
# expected return = rf + beta * mktrf.
# The beta is the 126-row estimate lagged by 10 rows within each CIK, i.e.
# the value available 10 observations before the current row.
Beta_df["AR"] = Beta_df["Return"].sub(
    Beta_df["rf"].add(
        Beta_df.groupby("CIK")["Beta_126"].shift(periods=10).mul(Beta_df["mktrf"])
    )
)

In [13]:
# Cumulative abnormal return over rows -5 .. +5 around each row.
# An 11-row trailing sum (needing at least 6 rows) is moved back by 5 rows
# within each CIK so that it sits on the centre of its window.
components = (
    Beta_df.groupby("CIK")[["Return", "rf", "mktrf"]]
    .rolling(window=11, min_periods=6)
    .sum()
    .groupby("CIK")
    .shift(periods=-5)
)

# CAR = sum(Return - rf) - lagged beta * sum(mktrf), combined by position.
# NOTE(review): positional combination presumes Beta_df is ordered by CIK in
# the same way as the grouped result -- confirm.
Beta_df["CAR_5"] = (
    components["Return"].sub(components["rf"]).to_numpy()
    - Beta_df.groupby("CIK")["Beta_126"].shift(periods=10).to_numpy()
    * components["mktrf"].to_numpy()
)

In [14]:
# Cumulative abnormal return over rows -5 .. +10 around each row.
# A 16-row trailing sum (needing at least 9 rows) is moved back by 10 rows
# within each CIK, so the window ends 10 rows after the row it is stored on.
components = (
    Beta_df.groupby("CIK")[["Return", "rf", "mktrf"]]
    .rolling(window=16, min_periods=9)
    .sum()
    .groupby("CIK")
    .shift(periods=-10)
)

# CAR = sum(Return - rf) - lagged beta * sum(mktrf), combined by position.
# NOTE(review): positional combination presumes Beta_df is ordered by CIK in
# the same way as the grouped result -- confirm.
Beta_df["CAR_10"] = (
    components["Return"].sub(components["rf"]).to_numpy()
    - Beta_df.groupby("CIK")["Beta_126"].shift(periods=10).to_numpy()
    * components["mktrf"].to_numpy()
)

In [12]:
# Cumulative abnormal return over rows -2 .. +2 around each row.
# A 5-row trailing sum (needing at least 3 rows) is moved back by 2 rows
# within each CIK so that it sits on the centre of its window.
components = (
    Beta_df.groupby("CIK")[["Return", "rf", "mktrf"]]
    .rolling(window=5, min_periods=3)
    .sum()
    .groupby("CIK")
    .shift(periods=-2)
)

# CAR = sum(Return - rf) - lagged beta * sum(mktrf), combined by position.
# NOTE(review): positional combination presumes Beta_df is ordered by CIK in
# the same way as the grouped result -- confirm.
Beta_df["CAR_2"] = (
    components["Return"].sub(components["rf"]).to_numpy()
    - Beta_df.groupby("CIK")["Beta_126"].shift(periods=10).to_numpy()
    * components["mktrf"].to_numpy()
)

In [17]:
# Turn the date index back into a column, keep only rows that have a 63-row
# beta, and remove the inputs and intermediates that are not exported.
Beta_df = (
    Beta_df
    .reset_index()
    .dropna(subset=["Beta_63"])
    .drop(
        columns=[
            "Return", "mktrf", "smb", "hml", "rf", "ex_Return",
            "mktrf_var_63", "covar_63",
            "mktrf_var_126", "covar_126",
            "mktrf_var_252", "covar_252",
            "AR",
        ]
    )
)

In [23]:
# Number of missing values in each remaining column.
Beta_df.isna().sum(axis=0)

datadate          0
CIK               0
Beta_63           0
Beta_126     424026
Beta_252    1249182
CAR_2        582126
CAR_5        621596
CAR_10       687307
dtype: int64

In [26]:
# Write the betas and cumulative abnormal returns without the row index.
# The path uses a forward slash: the previous "Data\Beta_AR.csv" literal
# contained the invalid escape sequence "\B" and only resolved on Windows.
Beta_df.to_csv("Data/Beta_AR.csv", index=False)

In [None]:
# Load the daily market index file and index it by its parsed DATE column.
# The path uses a forward slash: the previous 'Data\market_daily.csv' literal
# contained the invalid escape sequence "\m" and only resolved on Windows.
market_idx = pd.read_csv("Data/market_daily.csv")

market_idx["DATE"] = pd.to_datetime(market_idx["DATE"])
market_idx = market_idx.set_index("DATE")

vwretd > Value-Weighted Return (includes distributions)\
vwretx > Value-Weighted Return (excluding dividends)\
ewretd > Equal-Weighted Return (includes distributions)\
ewretx > Equal-Weighted Return (excluding dividends)\
totval > Total Market Value\
totcnt > Total Market Count\
usdval > Market Value of Securities Used\
usdcnt > Count of Securities Used\
sprtrn > Return on S&P Composite Index\
spindx > Level on S&P Composite Index

Fama French Beta

In [None]:
def calc_beta(data, N):
    """Estimate rolling three-factor loadings over a calendar-day look-back.

    For every label ``date`` in ``data.index`` whose look-back start
    (``date - N`` days) is not earlier than the first label in the index,
    an OLS of ``ex_Return`` on a constant plus ``mktrf``, ``smb`` and ``hml``
    is fitted on the rows labelled ``start`` through ``date`` (inclusive).

    Parameters
    ----------
    data : DataFrame
        Must contain the columns ``ex_Return``, ``mktrf``, ``smb`` and
        ``hml``. The index labels must support subtraction of a
        ``pd.Timedelta`` and label slicing.
    N : int
        Look-back length in calendar days.

    Returns
    -------
    list
        One entry per qualifying label, in index order, each holding the
        fitted ``mktrf``, ``smb`` and ``hml`` coefficients. Labels without a
        full look-back are skipped, so the list can be shorter than ``data``.
    """
    factor_cols = ['mktrf', 'smb', 'hml']
    first_label = data.index.min()
    lookback = pd.Timedelta(days=N)

    betas = []
    for date in data.index:
        start = date - lookback
        if not start >= first_label:
            # Not enough history before this label yet.
            continue
        window = data.loc[start:date]
        design = sm.add_constant(window[factor_cols])
        fitted = sm.OLS(window['ex_Return'], design).fit()
        betas.append(fitted.params[factor_cols])
    return betas

In [None]:
# Try the rolling regression on one firm (CIK 1750) with a 182-day look-back.
B_data = Beta_df.loc[Beta_df['CIK'] == 1750]
betas = calc_beta(
    data=B_data,
    N=182,
)

In [None]:
Beta_list = []
n = 182
# Run the rolling regression for each security.
# Iterating the groupby yields each firm's rows in one pass instead of
# re-scanning the whole frame with a boolean mask per CIK. sort=False keeps
# the firms in order of first appearance, as `unique()` did.
for cik, B_data in tqdm(Beta_df.groupby('CIK', sort=False)):
    betas = calc_beta(data=B_data, N=n)
    if not betas:
        # No regression was produced for this firm. Without this guard the
        # slice below becomes `index[-0:]`, which selects the WHOLE index and
        # would emit a row of missing betas for every date of the firm.
        continue
    # calc_beta returns one entry per trailing label of the firm's index, so
    # the results line up with the last len(betas) labels.
    df = pd.DataFrame(betas, index=B_data.index[-len(betas):])
    df["CIK"] = cik
    Beta_list.append(df)

In [None]:
# Stack the per-firm results, turn the index into a column and prefix the
# factor loadings with "B_".
FF_Betas = (
    pd.concat(Beta_list)
    .reset_index()
    .rename(
        columns={
            'mktrf': 'B_mktrf',
            'smb': 'B_smb',
            'hml': 'B_hml',
        }
    )
)

In [None]:
# Display the assembled factor-loading table as the cell output.
FF_Betas

Clean WRDS prices dataset

In [3]:
# Read only the needed columns of the raw WRDS export from the working
# directory (the cleaned result is written to Data/WRDS_prices.csv later).
# NOTE(review): the recorded output of this cell looks like a pandas warning
# pointing at this call -- possibly mixed column dtypes; confirm and consider
# passing explicit dtypes.
prices_df = pd.read_csv(
    "WRDS_prices.csv",
    usecols=[
        'iid', 'datadate', 'cusip', 'ajexdi', 'cshoc',
        'cshtrd', 'prccd', 'tpci', 'cik',
    ],
)

  prices_df=pd.read_csv("WRDS_prices.csv", usecols=['iid', 'datadate', 'cusip', 'ajexdi', 'cshoc', 'cshtrd', 'prccd', 'tpci', 'cik'])


In [4]:
# (rows, columns) of the raw price table just loaded.
prices_df.shape

(87057203, 9)

In [5]:
# CUSIP -> CIK lookup. The `cusip` field can hold several whitespace-separated
# codes, so it is split and exploded to one code per row.
# drop_duplicates() removes repeated (cik, cusip) pairs: a repeated pair would
# duplicate every matching price row in the left merge below, and those
# duplicates would later be summed into VOLUME / TTLCMNSHARESOUT.
# NOTE(review): a CUSIP that maps to several *different* CIKs still fans out;
# verify the mapping file if that can happen.
cikmap = (
    pd.read_csv('Data/CIK_Ticker_CUSIP.csv', usecols=['cik', 'cusip'])
    .dropna()
    .assign(cusip=lambda mapping: mapping['cusip'].str.split())
    .explode('cusip')
    .drop_duplicates()
)

prices_df = (
    pd.merge(
        left=prices_df,
        right=cikmap,
        on='cusip',
        how='left',
        suffixes=("", "_2"),
    )
    # Prefer the CIK already on the price row; fall back to the mapped one.
    .assign(cik=lambda merged: merged['cik'].fillna(merged['cik_2']))
    # Rows without a CIK or without a closing price are unusable.
    .dropna(subset=['cik', 'prccd'])
    .drop(columns=['cik_2', 'cusip'])
    .assign(
        cik=lambda kept: kept['cik'].astype(int),
        # Closing price divided by the adjustment factor.
        CLOSEPRICE=lambda kept: kept['prccd'] / kept['ajexdi'],
    )
)

In [6]:
# Restrict the price panel to firms that appear in the disclosure database.
CIKs = pd.read_csv("Data/All_1Afiles.csv", usecols=['CIK']).drop_duplicates()

prices_df = prices_df.loc[prices_df['cik'].isin(CIKs['CIK'].unique())]

In [None]:
# Order the rows by firm, date, issue type and issue id, keep only issue
# types 0/1 (stored as either strings or integers) that actually traded, and
# forward-fill shares outstanding within each (cik, tpci) group.
# The column is written with .assign() on the filtered result rather than by
# item assignment on a filtered frame, which triggers pandas'
# SettingWithCopyWarning.
prices_df = (
    prices_df
    .sort_values(['cik', 'datadate', 'tpci', 'iid'])
    .reset_index(drop=True)
    .loc[lambda frame: frame['tpci'].isin(['0', 0, '1', 1]) & (frame['cshtrd'] > 0)]
    .assign(cshoc=lambda frame: frame.groupby(['cik', 'tpci'])['cshoc'].ffill())
)

In [12]:
# Collapse multiple issues to one row per firm-day: traded volume and shares
# outstanding are summed, the adjusted close is averaged.
prices_df = prices_df.groupby(['cik', 'datadate'], as_index=False).agg(
    VOLUME=('cshtrd', 'sum'),
    TTLCMNSHARESOUT=('cshoc', 'sum'),
    CLOSEPRICE=('CLOSEPRICE', 'mean'),
)

In [13]:
# Persist the cleaned firm-day panel without the row index.
prices_df.to_csv(
    path_or_buf="Data/WRDS_prices.csv",
    index=False,
)

In [None]:
def calc_beta(B_data):
    """Return the market loading from one full-sample three-factor regression.

    Fits ``ex_Return ~ mktrf + smb + hml`` by OLS on all rows of ``B_data``
    and returns the fitted coefficient on ``mktrf``.

    Parameters
    ----------
    B_data : DataFrame
        Must contain the columns ``ex_Return``, ``mktrf``, ``smb`` and ``hml``.

    Returns
    -------
    The ``mktrf`` entry of the fitted parameter vector.

    Notes
    -----
    This notebook defines ``calc_beta`` more than once; whichever definition
    was executed last is the one in effect.
    """
    # Imported here because the notebook's top-level
    # `import statsmodels.formula.api as smf` is commented out, which made
    # `smf` an undefined name when this function ran.
    import statsmodels.formula.api as smf

    # Define the regression formula
    FamaFrench_model = smf.ols(formula='ex_Return ~ mktrf + smb + hml', data=B_data)

    # Fit the regression
    FamaFrench_fit = FamaFrench_model.fit()

    return FamaFrench_fit.params['mktrf']

calc_beta(Beta_df[Beta_df['CIK']==1750])

In [None]:
def calc_beta(data, N):
    """Estimate rolling three-factor loadings over fixed-length row windows.

    For every window of ``N`` consecutive rows of ``data`` an OLS of
    ``ex_Return`` on a constant plus ``mktrf``, ``smb`` and ``hml`` is fitted.

    The regression inputs are selected by column NAME. The earlier version
    picked them by position (column 4 as the response, columns 1-3 as the
    regressors), which does not match the Beta_df layout shown earlier in the
    notebook and disagreed with the by-name coefficient lookup below.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns ``ex_Return``, ``mktrf``, ``smb`` and ``hml``.
    N : int
        Number of rows per regression window.

    Returns
    -------
    list
        ``len(data) - N + 1`` entries (none when ``data`` has fewer than ``N``
        rows), each holding the fitted ``mktrf``, ``smb`` and ``hml``
        coefficients for one window, in row order.

    Notes
    -----
    This notebook defines ``calc_beta`` more than once; whichever definition
    was executed last is the one in effect.
    """
    factor_cols = ['mktrf', 'smb', 'hml']
    betas = []
    for start in range(len(data) - N + 1):
        window = data.iloc[start:start + N]
        Y = window['ex_Return']
        X = sm.add_constant(window[factor_cols])
        model = sm.OLS(Y, X).fit()
        betas.append(model.params[factor_cols])
    return betas