In [40]:
import pandas as pd
import numpy as np

# Locations of the three input CSV files (relative to the notebook).
three_factor_file_name = "./data/05_df_ff_info.csv"
fundamental_file_name = "./data/funda.csv"
monthly_stocks_file_name = "./data/monthlystocks.csv"

# Read every file into its own DataFrame using pandas' default parsing.
three_factor_df = pd.read_csv(three_factor_file_name)
fundamental_df = pd.read_csv(fundamental_file_name)
monthly_stocks_df = pd.read_csv(monthly_stocks_file_name)

# Preview the first rows of each table to check the columns and the raw
# value formats before any cleaning is applied.
display(
    three_factor_df.head(),
    fundamental_df.head(),
    monthly_stocks_df.head(),
)

Unnamed: 0,Date,Mkt-RF,SMB,HML,RF
0,1980-01,0.0551,0.0162,0.0175,0.008
1,1980-02,-0.0122,-0.0185,0.0061,0.0089
2,1980-03,-0.129,-0.0664,-0.0101,0.0121
3,1980-04,0.0397,0.0105,0.0106,0.0126
4,1980-05,0.0526,0.0213,0.0038,0.0081


Unnamed: 0,GVKEY,LPERMNO,datadate,fyear,indfmt,consol,popsrc,datafmt,curcd,csho,xrd,exchg,costat,fic,mkvalt,prcc_f,sic
0,1000,25881,12/31/1975,1975.0,INDL,C,D,STD,USD,2.098,,12,I,USA,,4.375,3089.0
1,1000,25881,12/31/1976,1976.0,INDL,C,D,STD,USD,2.207,,12,I,USA,,5.75,3089.0
2,1000,25881,12/31/1977,1977.0,INDL,C,D,STD,USD,2.226,,12,I,USA,,9.25,3089.0
3,1001,10015,12/31/1983,1983.0,INDL,C,D,STD,USD,3.568,0.0,14,I,USA,,7.25,5812.0
4,1001,10015,12/31/1984,1984.0,INDL,C,D,STD,USD,3.568,,14,I,USA,,3.75,5812.0


Unnamed: 0,PERMNO,date,PRC,VOL,RET,SHROUT
0,10000,12/31/1985,,,,
1,10000,01/31/1986,-4.375,1771.0,C,3680.0
2,10000,02/28/1986,-3.25,828.0,-0.257143,3680.0
3,10000,03/31/1986,-4.4375,1078.0,0.365385,3680.0
4,10000,04/30/1986,-4.0,957.0,-0.098592,3793.0


Apply filters

In [41]:
# Only US companies: each of the three columns must carry the expected code.
filter1, filter2, filter3 = (
    fundamental_df['indfmt'] == 'INDL',
    fundamental_df['curcd'] == 'USD',
    fundamental_df['fic'] == 'USA',
)

# Only big exchanges: exchange code within 11..19 (both ends included).
exchange_code = fundamental_df['exchg']
filter4, filter5 = exchange_code >= 11, exchange_code <= 19

# Remove finance companies: keep SIC codes outside 6000-6999.
# A missing sic fails both comparisons, so such rows are removed as well.
industry_code = fundamental_df['sic']
filter6 = (industry_code < 6000) | (industry_code > 6999)

# Apply filters: a row survives only if it passes every condition above.
keep_rows = filter1 & filter2 & filter3 & filter4 & filter5 & filter6
fundamental_df = fundamental_df.loc[keep_rows]

print(fundamental_df.shape)

(181467, 17)


Filter abnormal returns

In [42]:
# Convert RET to numbers.  The raw column mixes numeric text with letter codes
# (e.g. 'C'); errors='coerce' turns every non-numeric entry into NaN.
# The whole column is converted in a single vectorised call instead of calling
# pd.to_numeric once per row, and the result is attached with assign() so no
# column is written onto a filtered slice of the original frame.
monthly_stocks_df = monthly_stocks_df.assign(
    RET=pd.to_numeric(monthly_stocks_df['RET'], errors='coerce')
)

# Filter out outliers.  Any comparison with NaN is False, so this mask also
# removes rows whose RET was missing or non-numeric.
# NOTE(review): the RET values shown in the head() preview are decimal
# fractions (e.g. -0.257143).  If -100 / 500 were meant as percentages these
# bounds never bind -- confirm the intended thresholds.
filter1 = monthly_stocks_df['RET'] > -100
filter2 = monthly_stocks_df['RET'] < 500
monthly_stocks_df = monthly_stocks_df.loc[filter1 & filter2]


Calculate market cap

In [43]:
# Keep only rows with a strictly positive share count.  NaN > 0 is False, so
# rows with a missing SHROUT are removed by this mask as well.
filter1 = monthly_stocks_df['SHROUT'] > 0

# Every step returns a new frame.  Calling fillna/replace with inplace=True on
# a column selection (df['col'].method(..., inplace=True)) is chained
# assignment and is not guaranteed to update the frame, so the cleaned columns
# are assigned back explicitly instead.
monthly_stocks_df = (
    monthly_stocks_df.loc[filter1]
    .assign(
        # Get most recent SHROUT if missing.
        # NOTE(review): the mask above has already dropped every missing
        # SHROUT, so this forward fill currently changes nothing.  If
        # imputation is really wanted it has to run before the filter and per
        # PERMNO, otherwise one security's value spills into the next.
        SHROUT=lambda df: df['SHROUT'].ffill(),
        # Fix price: PRC is negative on some rows of the raw file, so use its
        # magnitude; a price of exactly 0 is treated as missing.
        PRC=lambda df: df['PRC'].abs().replace(0, np.nan),
    )
    # Drop na: both inputs of the market-cap product must be present.
    .dropna(subset=['SHROUT', 'PRC'])
    # Calculate market cap
    .assign(MKT_CAP=lambda df: df['PRC'] * df['SHROUT'])
)

In [44]:
# Sanity check: (rows, columns) of the monthly stock table at this point.
monthly_stocks_df.shape

(3459593, 7)

Parse date column and shift datadate by 3 months to avoid lookahead bias

In [45]:
# Parse datadate (MM/DD/YYYY text), push it forward by 3 months so the
# fundamentals are only dated once they could have been known, then derive the
# calendar year from the shifted date.
# NOTE(review): datadate is overwritten with the shifted value, so running this
# cell a second time shifts it by a further 3 months.
fundamental_df = fundamental_df.assign(
    datadate=lambda df: (
        pd.to_datetime(df['datadate'], format='%m/%d/%Y')
        + pd.DateOffset(months=3)
    ),
    # Evaluated after the line above, so this is the year of the shifted date.
    year=lambda df: df['datadate'].dt.year,
)

XRD calculation

In [46]:
# Fill xrd with 0 where it is NA, then order the rows by date and firm and give
# them a fresh 0..n-1 index.
# The filled column is assigned back explicitly: fillna(..., inplace=True) on a
# column selection is chained assignment and is not guaranteed to update the
# frame, which would leave NaNs in xrd.
fundamental_df = (
    fundamental_df
    .assign(xrd=lambda df: df['xrd'].fillna(0))
    # Sort the dataframe
    .sort_values(by=['datadate', 'LPERMNO'])
    .reset_index(drop=True)
)

def calc_weighted_avg(five_period_series):
    """Return the linearly decaying weighted sum of five observations.

    Computes ``x[4]*1 + x[3]*0.8 + x[2]*0.6 + x[1]*0.4 + x[0]*0.2``, i.e. the
    last element gets the largest weight.  Despite the name, the result is not
    divided by the sum of the weights.

    Parameters
    ----------
    five_period_series : sequence indexable at positions 0..4
        The five values to combine; the element at position 4 is weighted 1
        and the element at position 0 is weighted 0.2.

    Returns
    -------
    The weighted sum, in whatever numeric type the arithmetic produces.
    """
    # Begin with the most heavily weighted element, then add the remaining
    # terms one by one in the same order as the formula above.
    weighted_total = five_period_series[4] * 1
    for position, weight in ((3, 0.8), (2, 0.6), (1, 0.4), (0, 0.2)):
        weighted_total = weighted_total + five_period_series[position] * weight
    return weighted_total

# Calculate 5-period average of xrd
# For every LPERMNO, slide a 5-row window over xrd (rows in their current
# order) and combine each window with calc_weighted_avg.
rdc = (
    fundamental_df
    .groupby('LPERMNO')['xrd']
    .rolling(5)
    .apply(calc_weighted_avg, raw=True)
    # Remove the LPERMNO index level so only the original row labels remain.
    .reset_index(level=0, drop=True)
    .sort_index()
)

# Attach the result by row label and discard rows without a value: a window
# with fewer than 5 rows yields NaN, so the first four rows of every LPERMNO
# are removed here.
fundamental_df = fundamental_df.assign(rdc=rdc).dropna(subset=['rdc'])

Split firms into R&D and non-R&D firms

In [47]:
# Partition the rows on the sign of rdc:
# exactly zero -> non-R&D firms, strictly positive -> R&D firms.
rdc_values = fundamental_df['rdc']
non_rd_firms = fundamental_df.loc[rdc_values == 0]
rd_firms = fundamental_df.loc[rdc_values > 0]

In [48]:
# Report the size of both subsets next to the full table, one line each, so it
# is easy to see whether the two subsets account for every row.
print(
    "The shape of the dataframes are {} and {}".format(non_rd_firms.shape, rd_firms.shape),
    "The shape of fundamental_df is {}".format(fundamental_df.shape),
    sep="\n",
)

The shape of the dataframes are (61209, 19) and (61083, 19)
The shape of fundamental_df is (122292, 19)


Drop the useless columns