In [1]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
import pingouin as pg
from tqdm import tqdm

# --- Data loading ----------------------------------------------------------
# The data directory is read from the first line of 'data_path.txt'. It is
# used as a plain string prefix below, so it must already end with a path
# separator.
with open('data_path.txt') as f:
    lines = f.read().splitlines()
data_path = lines[0]

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load pickles that were produced by this project's own notebooks.

# Cleaned base data: market caps, key data, id table, returns, and the
# EU / US four-factor tables.
with open(data_path+'clean_data.pckl', 'rb') as f:
    mcdata, keydata, ids, returns, EU_4F, US_4F = pickle.load(f)

# BMG factor series (value- and equal-weighted) plus the per-rebalance-date
# tables of stocks that were valid for factor construction.
with open(data_path+'bmg.pckl', 'rb') as f:
    bmg_factors_value, bmg_factors_equal, valid_dict = pickle.load(f)

# Fama-French style factor series (value- and equal-weighted).
with open(data_path+'ff.pckl', 'rb') as f:
    ff_factors_value, ff_factors_equal = pickle.load(f)

# Factor table used by the regressions: only the value-weighted BMG factors.
# (An earlier variant combined SMB/HML from ff_factors_value with BMG; it is
# disabled, so ff_factors_value / ff_factors_equal are loaded but not used in
# this table.)
value_factors = bmg_factors_value[["Date","BMG","BMG2"]]



In [26]:
# --- Per-stock factor regressions -------------------------------------------
# For every non-financial stock that passes the market-cap and history
# filters, regress its excess return on five different factor sets and store
# the pg.linear_regression result in one dict per factor set, keyed by RIC.


def _row_label(frame, date):
    """Return the index label of the single row of ``frame`` whose Date equals ``date``.

    ``Index.item()`` raises ValueError when no row or more than one row matches.
    """
    return frame[frame.Date == date].index.item()


# exclude financials
id_list = ids[ids.Sector != "Financials"]
id_list = id_list.RIC.unique()

# last month of factor data (alternative sample end: 2021-06-30)
end_date = pd.Timestamp("2019-12-31")
latest_index = _row_label(returns, end_date)

# earliest month a return series may start (alternative: 2007-07-31)
earliest_start_date = pd.Timestamp("2009-01-31")
earliest_index = _row_label(returns, earliest_start_date)

# Market caps at the end date; identical for every stock, so computed once.
mcap_at_end = mcdata[mcdata.Date == end_date]

# create dicts
betas_4F = {}
betas_onlyBMG = {}
betas_onlyBMG2 = {}
betas_4FBMG = {}
betas_onlyMkt = {}
for stock in tqdm(id_list):

    # Skip stocks whose market cap at end_date is below 100.
    # NOTE(review): a NaN market cap compares False here and is therefore
    # NOT skipped - confirm that this is intended.
    if mcap_at_end[stock].item() < 100:
        continue

    # Find the start of the uninterrupted run of non-zero returns that ends
    # at latest_index by walking backwards from the end date. A return inside
    # [-0.001, 0.001] (or NaN) terminates the run.
    # `None` marks "no run found"; using 0 as the marker would misclassify a
    # run that legitimately starts at index label 0.
    # Assumes `returns` carries consecutive integer index labels - TODO confirm.
    start_index = None
    for i in range(latest_index, earliest_index - 1, -1):
        curr_return = returns.loc[i, stock]
        zero_check = (-0.001 > curr_return) or (curr_return > 0.001)
        if not zero_check:
            break
        start_index = i

    # at least 24 months of returns
    # NOTE(review): data_length is last-minus-first, so 25 observations are
    # needed to pass; confirm whether 24 observations were meant.
    data_length = 0 if start_index is None else latest_index - start_index
    if data_length < 24:
        continue
    start_date = returns.loc[start_index, "Date"]

    # risk free rate and market return: US table for US stocks, EU otherwise
    if ids[ids.RIC == stock]["Country"].item() == "USA":
        ff_source = US_4F
    else:
        ff_source = EU_4F
    # Slicing + reset_index yields a new frame, so no copy of the full table
    # is required.
    ff_data = ff_source.loc[_row_label(ff_source, start_date):_row_label(ff_source, end_date)]
    ff_data = ff_data.reset_index(drop=True)

    # define factors; the columns below are aligned on the reset 0..n-1 index
    x = value_factors.loc[_row_label(value_factors, start_date):_row_label(value_factors, end_date)]
    x = x.reset_index(drop=True)
    x["Mkt-RF"] = ff_data["Mkt-RF"]
    x["HML"] = ff_data["HML"]
    x["SMB"] = ff_data["SMB"]
    x["WML"] = ff_data["WML"]

    # Convert to floats
    x = x[["Mkt-RF","HML","SMB","WML","BMG","BMG2"]]
    x = x.astype(float)

    # regressor sets
    x_4FBMG = x[["Mkt-RF","HML","SMB","WML","BMG"]]
    x_4F = x[["Mkt-RF","HML","SMB","WML"]]
    x_onlyBMG = x[["Mkt-RF","BMG"]]
    x_onlyBMG2 = x[["Mkt-RF","BMG2"]]   # BMG2 test
    x_onlyMkt = x[["Mkt-RF"]]           # market-only baseline

    # define returns over the same window; start_index / latest_index are the
    # row labels of start_date / end_date in `returns`
    y = returns[[stock]].loc[start_index:latest_index]
    y = y.reset_index(drop=True)
    # subtract risk free rate
    y[stock] = y[stock] - ff_data["RF"]

    # convert to floats
    y = y.astype(float)

    # Regression: all five fits are computed before any is stored, so a
    # failing fit leaves no partial entry for this stock.
    lm_4FBMG = pg.linear_regression(x_4FBMG,y[stock])
    lm_4F = pg.linear_regression(x_4F,y[stock])
    lm_onlyBMG = pg.linear_regression(x_onlyBMG,y[stock])
    lm_onlyBMG2 = pg.linear_regression(x_onlyBMG2,y[stock])
    lm_onlyMkt = pg.linear_regression(x_onlyMkt,y[stock])

    # Save stats
    betas_4FBMG[stock] = lm_4FBMG
    betas_4F[stock] = lm_4F
    betas_onlyBMG[stock] = lm_onlyBMG
    betas_onlyMkt[stock] = lm_onlyMkt
    betas_onlyBMG2[stock] = lm_onlyBMG2



100%|██████████| 16464/16464 [01:18<00:00, 209.37it/s]


In [23]:
## Two stock samples used downstream
# 1) every stock for which the 4F+BMG regression was stored (kept as a list)
all_sample = [*betas_4FBMG]

# 2) stocks that appear in at least one rebalance table of valid_dict
#    (those used to build the BMG factors), de-duplicated via a set ...
bmg_sample = {
    ric
    for rebalance_table in valid_dict.values()
    for ric in rebalance_table.RIC
}
# ... and restricted to stocks that also have a stored regression
bmg_sample &= set(all_sample)

In [24]:
# Save Data for use in other files.
# The list order is the on-disk contract for readers of 'betas.pckl':
# [betas_onlyMkt, betas_4FBMG, betas_4F, betas_onlyBMG, betas_onlyBMG2,
#  all_sample, bmg_sample].
# Run the regression and sample cells first so the pickle reflects the
# current results. The `with` block closes the file even if dumping fails.
with open(data_path+'betas.pckl', 'wb') as f:
    pickle.dump([betas_onlyMkt, betas_4FBMG, betas_4F, betas_onlyBMG,betas_onlyBMG2, all_sample, bmg_sample], f)