In [5]:
import pandas as pd
import numpy as np
from IPython.display import clear_output
from dask import dataframe as dd

from src.model import model_fit, ensemble_predict, evaluate_n_members

In [9]:
# load data
# NOTE(review): pickle files can execute arbitrary code when loaded — only read trusted files.
X = pd.read_pickle('data/dataset_2.pkl').set_index('year_month')

# firm features
firm_features = [
    'InvestPPEInv', 'ShareIss1Y', 'ShareRepurchase', 'DelCOA', 'dNoa', 'GrLTNOA',
    'IntMom', 'LRreversal', 'Mom12m', 'Mom6m', 'MRreversal', 'ResidualMomentum', 'STreversal',
    'AM', 'BMdec', 'ChEQ', 'AssetGrowth', 'ChNWC', 'DelEqu', 'NOA', 'Size', 'SP',
    'AbnormalAccruals', 'Accruals', 'PctAcc', 'OPLeverage', 'BookLeverage', 'CF', 'cfp',
    'DelCOL', 'DelFINL', 'IdioRisk', 'IdioVol3F', 'Leverage', 'Beta', 'BetaFP',
    'BidAskSpread', 'DolVol', 'Illiquidity', 'PRC', 'VolMkt', 'VolSD', 'High52', 'MaxRet',
    'CashProd', 'GP', 'roaq', 'RoE', 'DelLTI',
]

# macro features
macro_features = ['CFNAI', 'P_I', 'EU_H', 'C_H', 'SO_I']

# All columns used downstream: firm features, then macro features, then the return columns.
# Built from the two lists above so the three definitions cannot drift apart.
cols = firm_features + macro_features + [
    'ab_ret', 'ex_return', 'ewretx', 'vwretx', 'sprtrn', 'ab_ewretx', 'ab_vwretx', 'ab_sprtrn',
]

In [10]:
# split data
from sklearn.model_selection import train_test_split

# Fixed seed so the three splits are identical on every Restart & Run All.
SPLIT_SEED = 42

# One third of the rows is held out as the test set; the remainder is halved
# into training and validation sets.
# NOTE(review): rows are shuffled before splitting, so the three sets overlap in
# time — confirm this is intended for this dataset.
train, X_test = train_test_split(X, test_size=1/3, shuffle=True, random_state=SPLIT_SEED)
X_train, X_val = train_test_split(train, test_size=1/2, shuffle=True, random_state=SPLIT_SEED)

In [15]:
# scale data (standardized)
# Fit the scaler on the training split only, then apply that same fitted
# transform to the validation and test splits.
# Every column in `cols` is scaled, which includes the return columns listed there.
from sklearn.preprocessing import StandardScaler


def _scaled_frame(values, like):
    """Wrap a scaled array in a DataFrame labelled with `like`'s index and the `cols` names."""
    return pd.DataFrame(values, index=like.index, columns=cols)


scl = StandardScaler()

# The scaled training rows keep the *training* index (previously the test
# index was assigned here, mislabelling every training row).
X_train_scaled = _scaled_frame(scl.fit_transform(X_train[cols]), X_train)
X_val_scaled = _scaled_frame(scl.transform(X_val[cols]), X_val)
X_test_scaled = _scaled_frame(scl.transform(X_test[cols]), X_test)

In [66]:
# Labels for the five quantile buckets, lowest ('qnt_1') to highest ('qnt_5').
QUINTILE_LABELS = [f'qnt_{i}' for i in range(1, 6)]


def _quintile_portfolios(frame, sort_col, label_col):
    """Bucket rows into per-month quintiles of `sort_col` and average each bucket.

    Adds (or overwrites) the column `label_col` on `frame` in place with the
    quintile label of every row, computed separately within each 'year_month'
    group. Returns a new DataFrame with one row per (year_month, quintile)
    holding the mean of every numeric column, sorted by those keys and with
    the keys restored as regular columns.
    """
    frame[label_col] = frame.groupby('year_month')[sort_col].transform(
        lambda g: pd.qcut(g, q=5, labels=QUINTILE_LABELS)
    )
    return (
        frame.groupby(['year_month', label_col])
        .mean(numeric_only=True)
        .sort_index()
        .reset_index()
    )


# portfolio sort on size (SMB)
portfo_SMB = _quintile_portfolios(X_test_scaled, 'Size', 'size_qnt')

# portfolio sort on book to market (HML)
portfo_HML = _quintile_portfolios(X_test_scaled, 'BMdec', 'value_qnt')

# portfolio sort on ROE (RMW)
# Stored under its own name; it previously overwrote portfo_HML.
portfo_RMW = _quintile_portfolios(X_test_scaled, 'RoE', 'profit_qnt')

# portfolio sort on asset growth (CMA)
# Stored under its own name; it previously overwrote portfo_HML.
portfo_CMA = _quintile_portfolios(X_test_scaled, 'AssetGrowth', 'invest_qnt')


In [72]:
# Show the feature averages of a single size quintile, one row per month.
portfo = 'qnt_1'
portfo_SMB.loc[portfo_SMB['size_qnt'] == portfo, firm_features + macro_features]

Unnamed: 0,InvestPPEInv,ShareIss1Y,ShareRepurchase,DelCOA,dNoa,GrLTNOA,IntMom,LRreversal,Mom12m,Mom6m,...,CashProd,GP,roaq,RoE,DelLTI,CFNAI,P_I,EU_H,C_H,SO_I
0,0.448229,0.168380,-0.858259,0.069334,0.242877,-0.014185,0.800791,0.759642,-0.325110,-0.854254,...,-0.007240,0.234997,-0.041138,0.005783,-0.164852,0.868059,0.938453,0.395964,1.048419,1.043319
5,0.266934,0.168380,-0.858259,0.256231,0.250821,0.142661,0.316198,0.804991,-0.424097,-0.306922,...,-0.009210,-0.200881,0.006800,0.004909,-0.085674,1.439553,2.254000,0.619278,0.895167,1.043319
10,0.308263,0.152263,-0.674313,0.095225,0.277602,-0.128582,-0.302831,0.738929,0.088801,0.414739,...,-0.012356,-0.200146,0.018226,0.003316,0.006911,0.419029,0.828824,0.445589,1.048419,-1.022641
15,0.127472,0.146019,-0.251237,0.168999,0.125133,-0.026165,-0.381884,0.597892,-0.275438,0.254085,...,-0.006581,0.034386,0.043457,0.004291,0.059538,1.051754,0.664380,0.594465,2.197807,1.289266
20,0.204923,0.115377,-0.858259,-0.130608,0.124962,-0.111600,-0.455296,0.415824,-0.225933,0.470336,...,-0.008532,-0.073758,0.077279,0.003908,0.042477,0.745597,0.911045,0.420777,0.971793,0.502234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,0.214893,-0.462852,-0.130890,0.344292,0.182773,-0.227648,1.387596,0.479789,0.522738,-0.345933,...,-0.073932,-0.762808,-0.593941,-0.003997,-0.014771,0.061846,-0.185244,0.346338,0.205535,-0.235609
2985,0.529960,-0.208607,-0.095815,0.459673,0.331249,-0.150843,1.524772,0.414813,0.548788,-0.387622,...,-0.084464,-0.748077,-0.588339,-0.016415,0.078655,-0.009591,-0.843018,0.495215,0.358786,0.256286
2990,0.341539,-0.301205,-0.119988,0.314705,0.149588,-0.143877,1.289774,0.427143,0.458925,-0.348679,...,-0.012038,-0.894226,-0.587314,-0.006293,0.005732,0.868059,1.294747,0.644091,0.512038,0.256286
2995,0.217575,-0.137937,-0.092645,0.420980,0.125264,-0.068160,0.905239,0.421142,-0.017030,-0.369312,...,-0.003254,-0.724151,-0.394147,-0.005021,-0.039960,0.541492,0.527344,0.718529,-0.024343,0.256286


In [None]:
# Evaluate the ensemble separately on each size-quintile portfolio.
# NOTE(review): `model_ex` is not defined in the cells visible here — it must
# come from an earlier cell; confirm it exists on a fresh kernel.
r2_ex_portfo = []

for i in range(1, 6):
    portfo = f'qnt_{i}'
    # Take features and target from the same filtered rows so they stay
    # aligned (previously the target was the full, unfiltered column).
    portfo_rows = portfo_SMB[portfo_SMB['size_qnt'] == portfo]
    ex_r2 = evaluate_n_members(
        model_ex,
        10,
        portfo_rows[firm_features + macro_features],
        portfo_rows['ex_return'],
    )
    r2_ex_portfo.append(ex_r2)


In [None]:
from sklearn.metrics import r2_score

# Per-tertile R² scores and predictions. These are the lists the loop appends
# to; previously the portfolio lists were (re)initialized here instead, which
# wiped the quintile results and left these names undefined.
r2_ex_ter, yhats_ex_ter = [], []

# NOTE(review): the 'tertiles' column and `model_ex` are not created in the
# cells visible here — confirm they exist on a fresh kernel.
# NOTE(review): the score is computed against 'ab_ret' while the quintile cell
# uses 'ex_return' with the same model — confirm which target is intended.
for i in range(0, 3):
    tertile_rows = X_test_scaled[X_test_scaled['tertiles'] == i]
    # predict returns for the rows in this tertile
    yhats = ensemble_predict(model_ex, tertile_rows[firm_features + macro_features])
    # evaluate performance in this tertile
    r2 = r2_score(tertile_rows['ab_ret'], yhats)
    r2_ex_ter.append(r2)
    yhats_ex_ter.append(yhats)

# Clear whatever output the prediction calls produced in this cell.
clear_output()

In [69]:
# Display the size-sorted portfolio table: one row per (year_month, size_qnt) pair.
portfo_SMB

Unnamed: 0,year_month,size_qnt,InvestPPEInv,ShareIss1Y,ShareRepurchase,DelCOA,dNoa,GrLTNOA,IntMom,LRreversal,...,C_H,SO_I,ab_ret,ex_return,ewretx,vwretx,sprtrn,ab_ewretx,ab_vwretx,ab_sprtrn
0,1971-12,qnt_1,0.448229,0.168380,-0.858259,0.069334,0.242877,-0.014185,0.800791,0.759642,...,1.048419,1.043319,0.599939,1.364627,1.733769,0.449815,0.253056,1.343501,0.297354,0.194890
1,1971-12,qnt_2,0.288131,0.163663,-0.858259,0.639961,0.315476,0.145936,0.789330,0.758812,...,1.048419,1.043319,0.189705,1.117307,1.733769,0.449815,0.253056,1.343501,0.297354,0.194890
2,1971-12,qnt_3,-0.064939,0.164852,-0.569201,-0.138416,0.076754,-0.230810,0.392544,0.636947,...,1.048419,1.043319,0.372059,1.118155,1.733769,0.449815,0.253056,1.343501,0.297354,0.194890
3,1971-12,qnt_4,0.012692,0.131160,0.008916,0.151332,0.124441,0.139331,0.259430,0.514547,...,1.048419,1.043319,-0.051746,0.185341,1.733769,0.449815,0.253056,1.343501,0.297354,0.194890
4,1971-12,qnt_5,-0.174547,0.154659,-0.099481,-0.179192,-0.028385,0.045091,0.366491,0.299813,...,1.048419,1.043319,0.137521,0.237044,1.733769,0.449815,0.253056,1.343501,0.297354,0.194890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000,2021-12,qnt_1,0.252697,-0.137626,-0.057044,0.246670,0.173541,-0.226905,0.425391,0.309142,...,-0.713975,0.207097,0.372899,-0.613570,-1.396647,-1.447257,-1.306878,-1.673645,-1.712222,-1.563498
3001,2021-12,qnt_2,0.262223,0.042573,0.513313,0.261370,0.299810,-0.232912,1.102673,0.126464,...,-0.713975,0.207097,0.180225,-0.518776,-1.396647,-1.447257,-1.306878,-1.673645,-1.712222,-1.563498
3002,2021-12,qnt_3,0.220405,0.123067,0.621952,0.251263,0.178234,-0.075660,0.680686,-0.090010,...,-0.713975,0.207097,-0.020701,-0.562947,-1.396647,-1.447257,-1.306878,-1.673645,-1.712222,-1.563498
3003,2021-12,qnt_4,0.127550,-0.020215,0.621952,0.157597,0.040777,-0.176338,0.377733,-0.125777,...,-0.713975,0.207097,-0.023942,-0.558003,-1.396647,-1.447257,-1.306878,-1.673645,-1.712222,-1.563498
