In [1]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings("ignore")
from tqdm.auto import tqdm
import scipy
from scipy import stats
import statsmodels.formula.api as smf
from scipy.stats.mstats import winsorize
import statsmodels.api as sm
import pingouin as pg

# define data path
with open('data_path.txt') as f:
    lines = f.read().splitlines()
data_path = lines[0]

# load key data
f = open(data_path+'clean_data.pckl', 'rb')
mcdata, keydata, ids, returns, EU_4F, US_4F = pickle.load(f)
f.close()
keydata = keydata.drop(columns=["NumShares"])

# load betas
f = open(data_path+'betas.pckl', 'rb')
betas_onlyMkt, betas_4FBMG, betas_4F, betas_onlyBMG, betas_onlyBMG2, all_sample, bmg_sample = pickle.load(f)
f.close()

# load cross section data
f = open(data_path+'cross_section.pckl', 'rb')
cs_dict, lambdas_BMG, lambdas_BMG2, lambdas_co2 = pickle.load(f)
f.close()

# data dates
mc_date = pd.Timestamp("2021-09-30")
key_date = pd.Timestamp("2020-12-31")

# load bmg data
f = open(data_path+'bmg.pckl', 'rb')
bmg_factors_value, bmg_factors_equal, valid_dict = pickle.load(f)
f.close()

value_factors = bmg_factors_value[["Date","BMG","BMG2"]]

In [72]:
# last month of factor data
#end_date = pd.Timestamp("2021-06-30")
end_date = pd.Timestamp("2019-12-31")
latest_index = returns[returns.Date == end_date].index.item()

# determine valid series of returns and start_date
#earliest_start_date = pd.Timestamp("2007-07-31")
earliest_start_date = pd.Timestamp("2009-01-31")
earliest_index = returns[returns.Date == earliest_start_date].index.item()

for stock in ["XOM","AAPL.O","XOM","VWS.CO","GAZP.MM","AAPL.O","XOM","RDSa.AS","JNJ","MRK","GSK.L","AAPL.O"]:

    # market cap > 100
    if mcdata[mcdata.Date == end_date][stock].item() < 100:
        continue
    
    # determine return series and length, starting from nonzero return value
    data_length = 0
    start_index = 0
    for i in range(earliest_index,latest_index+1):
        curr_return = returns.loc[i,stock]
        zero_check = (-0.001 > curr_return) or (curr_return > 0.001)
        if zero_check and (start_index == 0):
            start_index = i
            start_date = returns.loc[i,"Date"]
        if (zero_check == False):
            start_index = 0
            start_date = 0

    if start_index != 0:
        data_length = latest_index - start_index

    # at least 24 months of returns
    if data_length < 24:
        continue

    # risk free rate and market return
    if ids[ids.RIC == stock]["Country"].item() == "USA":
        ff_data = US_4F.copy()
    else:
        ff_data = EU_4F.copy()
    ff_data = ff_data.loc[ff_data[ff_data.Date == start_date].index.item():ff_data[ff_data.Date == end_date].index.item()]
    ff_data = ff_data.reset_index(drop=True)

    # define factors
    x = value_factors.loc[value_factors[value_factors.Date == start_date].index.item():value_factors[value_factors.Date == end_date].index.item()]
    x = x.reset_index(drop=True)
    x["Mkt-RF"] = ff_data["Mkt-RF"]
    x["HML"] = ff_data["HML"]
    x["SMB"] = ff_data["SMB"]
    x["WML"] = ff_data["WML"]
    
    # Convert to floats
    x = x[["Mkt-RF","HML","SMB","WML","BMG","BMG2"]]
    x = x.astype(float)
        
    x_4FBMG = x[["Mkt-RF","HML","SMB","WML","BMG"]]
    x_onlyBMG = x[["Mkt-RF","BMG"]]
    
    x_4F = x[["Mkt-RF","HML","SMB","WML"]]
    
    # BMG2 Test
    x_onlyBMG2 = x[["Mkt-RF","BMG2"]]

    # define returns
    y = returns[[stock]].loc[returns[returns.Date == start_date].index.item():returns[returns.Date == end_date].index.item()] 
    y = y.reset_index(drop=True)
    # subtract risk free rate
    y[stock] = y[stock] - ff_data["RF"]
    
    #convert to floats
    y = y.astype(float)
    
    #Regression
    X = sm.add_constant(x_4FBMG)
    lm_4FBMG = sm.OLS(y,X).fit()
    lm_4F = pg.linear_regression(x_4F,y[stock])
    lm_onlyBMG = pg.linear_regression(x_onlyBMG,y[stock])
    lm_onlyBMG2 = pg.linear_regression(x_onlyBMG2,y[stock])

    # Save stats
    betas_4FBMG[stock] = lm_4FBMG
    betas_4F[stock] = lm_4F
    betas_onlyBMG[stock] = lm_onlyBMG
    
    #BMG2 Test
    betas_onlyBMG2[stock] = lm_onlyBMG2

lm_4FBMG.summary()

0,1,2,3
Dep. Variable:,AAPL.O,R-squared:,0.456
Model:,OLS,Adj. R-squared:,0.435
Method:,Least Squares,F-statistic:,21.15
Date:,"Thu, 16 Dec 2021",Prob (F-statistic):,2.62e-15
Time:,16:39:20,Log-Likelihood:,-413.4
No. Observations:,132,AIC:,838.8
Df Residuals:,126,BIC:,856.1
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7339,0.535,1.371,0.173,-0.325,1.793
Mkt-RF,1.0137,0.143,7.097,0.000,0.731,1.296
HML,-0.3456,0.225,-1.533,0.128,-0.792,0.100
SMB,-0.2115,0.227,-0.933,0.353,-0.660,0.237
WML,-0.2276,0.121,-1.874,0.063,-0.468,0.013
BMG,-1.4361,0.279,-5.141,0.000,-1.989,-0.883

0,1,2,3
Omnibus:,15.879,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.288
Skew:,-0.646,Prob(JB):,1.45e-05
Kurtosis:,4.544,Cond. No.,5.92


In [76]:
## returns description
returns_list = []
mc_list = []
start_date = pd.Timestamp("2009-01-31")
end_date = pd.Timestamp("2019-12-31")
returns_sample = returns[returns.Date >= start_date][returns.Date <= end_date]
mcdata_sample = mcdata[mcdata.Date >= start_date][mcdata.Date <= end_date]

for stock in bmg_sample:
    r = list(returns_sample[stock])
    mc = list(mcdata_sample[stock])
    returns_list = returns_list + r
    mc_list = mc_list + mc
    

sample_data = pd.DataFrame()
sample_data["r"] = returns_list
sample_data["mc"] = mc_list
sample_returns = sample_data[sample_data.r != 0]
sample_mc = sample_data[sample_data.mc > 0]
print("Amount of return observations: "+str(len(sample_returns["r"])))
print("Mean returns: "+str(np.mean(sample_returns["r"])))
print("Median returns: "+str(np.median(sample_returns["r"])))
print("Std returns: "+str(np.std(sample_returns["r"])))

print("Amount of mc observations: "+str(len(sample_mc["mc"])))
print("Mean mc: "+str(np.mean(sample_mc["mc"])))
print("Median mc: "+str(np.median(sample_mc["mc"])))
print("Std mc: "+str(np.std(sample_mc["mc"])))




Amount of return observations: 205002
Mean returns: 1.5586664021754748
Median returns: 1.1658174235
Std returns: 25.921979592131507
Amount of mc observations: 206317
Mean mc: 14660.092353283639
Median mc: 3636.644732
Std mc: 40295.098088276034


In [51]:
### Data description
# build df

sample_df = pd.DataFrame()
sample_df["RIC"] = list(bmg_sample)

sample_df = sample_df.merge(ids)


#data_statistics
# num stocks in sample
num_stocks = len(sample_df.RIC)
print("Amount of stocks in sample: "+str(num_stocks))

num_countries = len(sample_df.Country.unique())
print("Amount of countries in sample: "+str(num_countries))


num_sectors = len(sample_df.Sector.unique())
print("Amount of sectors in sample: "+str(num_sectors))

keydata_bmg = sample_df.merge(keydata)
keydata_bmg = keydata_bmg[keydata_bmg.Date >= start_date]
keydata_bmg = keydata_bmg[keydata_bmg.Date <= end_date]
keystats = pd.DataFrame()

for col in list(keydata_bmg.columns)[6:]:
    series = keydata_bmg[[col]]
    series = series[series[col] != 0]
    series = pd.to_numeric(series[col])
    N = len(series)
    mean = np.mean(series)
    median = np.median(series)
    std = np.std(series)
    
    temp_df = pd.DataFrame({
    "Variable":[col],
    "N":[N],
    "Mean":[mean],
    "Median":[median],
    "Std":[std]})

    keystats = keystats.append(temp_df)

keystats = keystats.reset_index(drop=True)
keystats



Amount of stocks in sample: 1709
Amount of countries in sample: 25
Amount of sectors in sample: 10


Unnamed: 0,Variable,N,Mean,Median,Std
0,TotalCO2,11073,4768831.0,227000.0,16458900.0
1,MarketCap,17344,15088.89,3748.510742,42053.71
2,CO2Scope3,7534,11820970.0,28735.5,61118640.0
3,AnalyticCO2,11056,435.9952,44.198975,1789.197
4,Revenue,18686,10251.71,2420.434034,28133.93
5,ESG,15378,48.87424,51.911322,23.64018
6,BVpershare,18754,160.0226,8.890061,6623.844
7,BookValue,18085,5274.843,1402.0,13870.78
8,BtoM,17300,0.531478,0.415873,3.969855


In [53]:
ff_data_US = US_4F.copy()
ff_data_EU = EU_4F.copy()
ff_data_US = ff_data.loc[ff_data[ff_data.Date == start_date].index.item():ff_data[ff_data.Date == end_date].index.item()]
ff_data_US = ff_data.reset_index(drop=True)
ff_data_EU = ff_data.loc[ff_data[ff_data.Date == start_date].index.item():ff_data[ff_data.Date == end_date].index.item()]
ff_data_EU = ff_data.reset_index(drop=True)

ff_data_EU



Unnamed: 0,Date,Mkt-RF,SMB,HML,RF,WML
0,2009-01-31,-8.12,0.18,-11.11,0.00,-2.18
1,2009-02-28,-10.10,0.13,-6.91,0.01,4.41
2,2009-03-31,8.95,0.00,3.34,0.02,-11.87
3,2009-04-30,10.18,5.28,5.26,0.01,-34.30
4,2009-05-31,5.21,-2.52,0.37,0.00,-12.49
...,...,...,...,...,...,...
127,2019-08-31,-2.58,-2.32,-4.95,0.16,7.47
128,2019-09-30,1.43,-0.97,6.83,0.18,-6.81
129,2019-10-31,2.06,0.28,-1.93,0.15,0.19
130,2019-11-30,3.87,0.80,-2.02,0.12,-2.63


In [75]:
#### Correlations between factor returns
factors_df = bmg_factors_value[bmg_factors_value.Date >= start_date][bmg_factors_value.Date <= end_date]
factors_df = factors_df.reset_index(drop=True)
x_4FBMG.corr()


Unnamed: 0,Mkt-RF,HML,SMB,WML,BMG
Mkt-RF,1.0,0.33176,0.354609,-0.347701,-0.005768
HML,0.33176,1.0,0.126622,-0.438071,0.353098
SMB,0.354609,0.126622,1.0,-0.140655,0.006822
WML,-0.347701,-0.438071,-0.140655,1.0,-0.115381
BMG,-0.005768,0.353098,0.006822,-0.115381,1.0
