In [1]:
import pandas as pd
import numpy as np

# Load the pickled inputs: R&D firms, non-R&D firms and the monthly stock panel.
# NOTE(review): pd.read_pickle executes arbitrary code on load; only use with
# pickle files produced by this project.
rd_firms, non_rd_firms, monthly_stocks_df = map(
    pd.read_pickle,
    ('data/rd_firms.pkl', 'data/non_rd_firms.pkl', 'data/monthly_stocks_df.pkl'),
)

# Load the factor data used by the three-factor regressions further down
three_factor_model = pd.read_csv('data/05_df_ff_info.csv')

In [2]:
# Parse the factor dates and roll each one forward to its month end
# (MonthEnd(0) leaves dates that are already a month end unchanged).
three_factor_model['Date'] = (
    pd.to_datetime(three_factor_model['Date']) + pd.offsets.MonthEnd(0)
)

display(three_factor_model.head())

Unnamed: 0,Date,Mkt-RF,SMB,HML,RF
0,1980-01-31,0.0551,0.0162,0.0175,0.008
1,1980-02-29,-0.0122,-0.0185,0.0061,0.0089
2,1980-03-31,-0.129,-0.0664,-0.0101,0.0121
3,1980-04-30,0.0397,0.0105,0.0106,0.0126
4,1980-05-31,0.0526,0.0213,0.0038,0.0081


In [3]:
import datetime

display(monthly_stocks_df)

# Inclusive boundaries of the sample periods.
start_period = datetime.datetime(1981, 7, 1)
mid_period = datetime.datetime(1999, 12, 31)
end_period = datetime.datetime(2012, 12, 31)
# Kept under its own name so that `extended_period` below is only ever the
# boolean mask (previously the same name held the datetime first and was then
# overwritten by the mask).
extended_end = datetime.datetime(2021, 12, 31)

# Boolean row masks over monthly_stocks_df, one per sample period.
stock_dates = monthly_stocks_df['date']
pre_2000 = (stock_dates >= start_period) & (stock_dates <= mid_period)
# NOTE(review): both pre_2000 and post_2000 include rows dated exactly
# mid_period (1999-12-31). Left unchanged to preserve existing results;
# confirm whether the overlap is intended.
post_2000 = (stock_dates >= mid_period) & (stock_dates <= end_period)
full_period = (stock_dates >= start_period) & (stock_dates <= end_period)
extended_period = (stock_dates >= start_period) & (stock_dates <= extended_end)

Unnamed: 0,PERMNO,date,PRC,VOL,RET,SHROUT,MKT_CAP
2,10000,1986-02-28,3.25000,828.0,-0.257143,3680.0,1.196000e+04
3,10000,1986-03-31,4.43750,1078.0,0.365385,3680.0,1.633000e+04
4,10000,1986-04-30,4.00000,957.0,-0.098592,3793.0,1.517200e+04
5,10000,1986-05-31,3.10938,1074.0,-0.222656,3793.0,1.179388e+04
6,10000,1986-06-30,3.09375,1069.0,-0.005025,3793.0,1.173459e+04
...,...,...,...,...,...,...,...
3594905,93436,2021-08-31,735.71997,3812156.0,0.070605,1001767.0,7.370200e+08
3594906,93436,2021-09-30,775.47998,3889228.0,0.054042,1004000.0,7.785819e+08
3594907,93436,2021-10-31,1114.00000,5263955.0,0.436530,1004265.0,1.118751e+09
3594908,93436,2021-11-30,1144.76001,6457197.0,0.027612,1004265.0,1.149642e+09


Bin the stocks into portfolios

In [4]:
# Within each reconstitution date, split firms into five quantile bins of
# rdc_to_mkt_cap, labelled from 'L' (lowest) to 'H' (highest).
rd_firms['bin'] = (
    rd_firms
    .groupby(['reconstitution_date'])['rdc_to_mkt_cap']
    .transform(lambda ratios: pd.qcut(ratios, 5, labels=['L', '2', '3', '4', 'H']))
)

Returns

In [5]:


import statsmodels.api as sm

# Equal-weighted R&D quintile portfolios.
#
# Every month
# -- Calculate the portfolio return (using the weights set the previous month)
# -- Reset the weights to 1/N over the holdings that traded this month
#
# -- If it is a reconstitution month (31 March)
# -- -- Select the new constituents of the bin
# -- -- Give each constituent present this month an equal weight
#
# The result per period is a dataframe with 5 columns, one for each bin.

BIN_LABELS = ['L', '2', '3', '4', 'H']

dataPeriods = [
    monthly_stocks_df[pre_2000],
    monthly_stocks_df[post_2000],
    monthly_stocks_df[full_period],
    monthly_stocks_df[extended_period],
]

# Run for each period
for data in dataPeriods:
    print("For period: " + str(data['date'].min()) + " - " + str(data['date'].max()))
    bin_returns = {}

    # Run for each bin (named bin_label so the builtin `bin` is not shadowed)
    for bin_label in BIN_LABELS:
        returns = []
        currentStocks = {}  # PERMNO -> portfolio weight

        # Iterate over each month in the period
        for index, group in data.groupby('date'):
            # Non-inplace: do not mutate the slice handed out by groupby
            group = group.set_index('PERMNO')
            # A set gives O(1) membership tests in the per-stock loops below
            stocksPresent = set(currentStocks) & set(group.index)

            # Calculate returns with the weights carried over from last month.
            # NOTE(review): before the first reconstitution in a period the
            # portfolio is empty, so these months are recorded with a return
            # of 0 and still enter the means and regressions below - confirm
            # this is intended.
            ret = sum(
                weight * group.loc[lpermno, 'RET']
                for lpermno, weight in currentStocks.items()
                if lpermno in stocksPresent
            )

            # Update weights: equal weight across holdings present this month,
            # zero for holdings without a row this month
            equalWeight = 1 / len(stocksPresent) if stocksPresent else 0
            currentStocks = {
                lpermno: (equalWeight if lpermno in stocksPresent else 0)
                for lpermno in currentStocks
            }

            # If it is reconstitution month
            if index.month == 3 and index.day == 31:
                # Select the firms assigned to this bin at this reconstitution
                isMember = (rd_firms['reconstitution_date'] == index) & (rd_firms['bin'] == bin_label)
                newTickers = rd_firms.loc[isMember, 'PERMNO'].values

                # Only keep constituents that have a row this month, equally weighted
                stocksPresent = set(newTickers) & set(group.index)
                currentStocks = {lpermno: 1 / len(stocksPresent) for lpermno in stocksPresent}

            returns.append((index, ret))

        bin_returns[bin_label] = (
            pd.DataFrame(returns, columns=['date', 'ret']).set_index('date')['ret']
        )

    # One column per bin, indexed by month
    log = pd.DataFrame(bin_returns, columns=BIN_LABELS)

    # SUBTRACT RF and DISPLAY RESULTS ------------------------------------#

    # Merge log and three_factor_model on date
    log = log.merge(three_factor_model, left_index=True, right_on='Date')

    # Convert the bin columns to excess returns (return minus RF)
    log[BIN_LABELS] = log[BIN_LABELS].sub(log['RF'], axis=0)

    # Mean monthly excess return per bin, multiplied by 100
    means = log[BIN_LABELS].mean() * 100
    display(means)

    # Long-short portfolio: highest bin minus lowest bin
    log['Port'] = log['H'] - log['L']
    y = log['Port']

    # Regress the long-short portfolio on the three factors
    X = sm.add_constant(log[['Mkt-RF', 'SMB', 'HML']])
    model = sm.OLS(y, X).fit()
    predictions = model.predict(X)

    print("Regression summary for three factors")
    display(model.summary())

    print("Regression summary against the market factor")
    # Regress the long-short portfolio on the market factor only
    X = sm.add_constant(log[['Mkt-RF']])
    model = sm.OLS(y, X).fit()
    predictions = model.predict(X)

    display(model.summary())

    # ---------------------------------------------------------------------#


For period: 1981-07-31 00:00:00 - 1999-12-31 00:00:00


: 

: 

In [None]:
# Non RD firms: a single value-weighted portfolio, reconstituted every 31 March.
NON_RD_LABEL = 'Non RD firms'

# Run for each period
for data in dataPeriods:
    print("For period: " + str(data['date'].min()) + " - " + str(data['date'].max()))

    returns = []
    currentStocks = {}  # PERMNO -> portfolio weight

    # Iterate over each month in the period
    for index, group in data.groupby('date'):
        # Non-inplace: do not mutate the slice handed out by groupby
        group = group.set_index('PERMNO')
        stocksPresent = list(set(currentStocks) & set(group.index))
        # Set copy gives O(1) membership tests in the per-stock loops below
        presentSet = set(stocksPresent)

        # Calculate returns with the weights carried over from last month.
        # NOTE(review): before the first reconstitution in a period the
        # portfolio is empty, so these months are recorded with a return of 0
        # and still enter the mean below - confirm this is intended.
        ret = sum(
            weight * group.loc[lpermno, 'RET']
            for lpermno, weight in currentStocks.items()
            if lpermno in presentSet
        )

        # Update weights: this month's market cap share among the holdings
        # present, zero for holdings without a row this month
        totalMarketCap = group.loc[stocksPresent, 'MKT_CAP'].sum()
        currentStocks = {
            lpermno: (group.loc[lpermno, 'MKT_CAP'] / totalMarketCap if lpermno in presentSet else 0)
            for lpermno in currentStocks
        }

        # If it is reconstitution month
        if index.month == 3 and index.day == 31:
            # Select the non-R&D firms listed for this reconstitution date
            isMember = non_rd_firms['reconstitution_date'] == index
            newTickers = non_rd_firms.loc[isMember, 'PERMNO'].values

            # Only keep constituents that have a row this month, value weighted
            stocksPresent = list(set(newTickers) & set(group.index))
            totalMarketCap = group.loc[stocksPresent, 'MKT_CAP'].sum()
            currentStocks = {
                lpermno: group.loc[lpermno, 'MKT_CAP'] / totalMarketCap
                for lpermno in stocksPresent
            }

        returns.append((index, ret))

    # Single-column frame of monthly portfolio returns, indexed by month
    log = (
        pd.DataFrame(returns, columns=['date', NON_RD_LABEL])
        .set_index('date')
    )

    # SUBTRACT RF and DISPLAY RESULTS ------------------------------------#

    # Merge log and three_factor_model on date
    log = log.merge(three_factor_model, left_index=True, right_on='Date')

    # Convert the portfolio column to excess returns (return minus RF)
    log[NON_RD_LABEL] = log[NON_RD_LABEL] - log['RF']

    # Mean monthly excess return, multiplied by 100
    means = log[[NON_RD_LABEL]].mean() * 100
    display(means)

    # ---------------------------------------------------------------------#

For period: 1981-07-31 00:00:00 - 1999-12-31 00:00:00


Non RD firms    0.851335
dtype: float64

For period: 1999-12-31 00:00:00 - 2012-12-31 00:00:00


Non RD firms    0.29287
dtype: float64

For period: 1981-07-31 00:00:00 - 2012-12-31 00:00:00


Non RD firms    0.617375
dtype: float64

For period: 1981-07-31 00:00:00 - 2021-12-31 00:00:00


Non RD firms    0.702838
dtype: float64

In [None]:
# Create