In [1]:
import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta

from tqdm import tqdm

from joblib import delayed, Parallel 

import sys
sys.path.append('../')
from utils import charas

import warnings
warnings.filterwarnings('ignore')

In [128]:
class modelBase:
    """
    Shared skeleton for the factor models defined in this notebook.

    Subclasses provide `calBeta`, `calFactor` and `cal_delayed_Factor`
    (and optionally `train_model`); `inference` and `predict` combine them
    into fitted / predicted returns. Dates are YYYYMMDD integers.
    """

    def __init__(self, name):
        """Store the model name and set the default train/valid/test windows."""
        self.name = name
        self.train_idx = 0

        # Default [start, end] windows, following the original paper.
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period  = [19870101, 19871231]

    def train_model(self):
        """Fitting hook; the base implementation does nothing and returns None."""
        return None

    def calBeta(self, month):
        """
        Beta (loadings) for the given month. To be supplied by each model.
        -> expected return value: np.array, dim = (N, K); the base returns None
        """
        return None

    def calFactor(self, month):
        """
        Factor realisation for the given month. To be supplied by each model.
        -> expected return value: np.array, dim = (K, 1); the base returns None
        """
        return None

    def cal_delayed_Factor(self, month):
        """
        Lagged factor for the given month, i.e. the average of the factors
        up to t-1. To be supplied by each model.
        -> expected return value: np.array, dim = (K, 1); the base returns None
        """
        return None

    def inference(self, month):
        """
        Fitted returns for `month` from the contemporaneous factor.

        Only the lower bound of the test period is checked here.
        -> return Beta_{N*K} @ F_{K*1}
        """
        assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"

        # Factor first, then beta: same call order as before.
        mon_factor = self.calFactor(month)
        mon_beta = self.calBeta(month)

        assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"

        fitted = mon_beta @ mon_factor
        return fitted

    def predict(self, month):
        """
        Predicted returns for `month` from the lagged (averaged) factor.

        `month` must lie inside the test period, bounds included.
        -> return Beta_{N*K} @ lag_F_avg_{K*1}
        """
        assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}"

        lag_factor = self.cal_delayed_Factor(month)
        mon_beta = self.calBeta(month)

        assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"

        predicted = mon_beta @ lag_factor
        return predicted

    def refit(self):
        """
        Roll the sample split forward by one year (+10000 on YYYYMMDD).

        The training window only grows at its end; the validation and test
        windows shift as a whole.
        """
        one_year = 10000
        self.train_period[1] = self.train_period[1] + one_year
        self.valid_period = [day + one_year for day in self.valid_period]
        self.test_period = [day + one_year for day in self.test_period]
        

In [3]:
def stock_R_matrix(start_date, end_date):
    """
    Load '../data/stock_R_matrix.pkl' and keep only the columns whose labels
    fall in the label slice start_date..end_date (both ends included).

    The frame is transposed, sliced by row label, then transposed back, so the
    returned frame has the same orientation as the pickled one.
    Presumably columns are YYYYMMDD dates and rows are stocks; verify against the data.
    """
    by_label = pd.read_pickle('../data/stock_R_matrix.pkl').T
    window = by_label.loc[start_date: end_date]
    return window.T

def portfolio_R_matrix(start_date, end_date):
    """
    Load '../data/portfolio_ret.pkl' and return the rows whose 'DATE' lies in
    [start_date, end_date] (inclusive), indexed by 'DATE' and transposed.

    The result therefore has one row per remaining column of the pickle and
    one column per selected date.
    """
    returns_long = pd.read_pickle('../data/portfolio_ret.pkl')
    dates = returns_long['DATE']
    in_window = (dates >= start_date) & (dates <= end_date)
    selected = returns_long.loc[in_window]
    return selected.set_index('DATE').T

In [None]:
class PCA(modelBase):
    """
    PCA factor model: loadings and factors come from an SVD of the de-meaned
    return matrix covering train_period[0] .. month.

    `inference` and `predict` override the base-class versions directly, so
    `calBeta` / `calFactor` / `cal_delayed_Factor` are not used by this model.
    """

    def __init__(self, K, portfolio=True):
        """
        K         : number of principal components (factors) to keep
        portfolio : if True use `portfolio_R_matrix`, otherwise `stock_R_matrix`
        """
        super(PCA, self).__init__(f'PCA_{K}')
        self.K = K
        self.portfolio = portfolio


    def __col_de_mean(self, matrix):
        """Subtract each column's mean and replace remaining NaNs with 0."""
        return (matrix - matrix.mean()).fillna(0)


    def __decompose(self, month):
        """
        Load returns from train_period[0] up to `month`, de-mean them and
        return (B, F) where B is u[:, :K] and F is diag(sigma[:K]) @ vt[:K, :].

        Shared by `inference` and `predict`, which previously duplicated it.
        """
        if self.portfolio:
            r_matrix = self.__col_de_mean(portfolio_R_matrix(self.train_period[0], month)).astype(np.float32)
        else:
            # NOTE(review): this branch is not cast to float32, unlike the portfolio branch - kept as is.
            r_matrix = self.__col_de_mean(stock_R_matrix(self.train_period[0], month))

        # Economy SVD: only the leading K vectors are used, so the full N*N `u`
        # that the default `full_matrices=True` builds is never needed.
        # A plain ndarray is passed so the outputs do not depend on how pandas wraps array results.
        u, sigma, vt = np.linalg.svd(np.asarray(r_matrix), full_matrices=False)

        # B_{N*K}
        B = u[:, :self.K]
        # F_{K*T}: scaling row i of vt by sigma_i equals diag(sigma[:K]) @ vt[:K, :]
        F = sigma[:self.K, None] * vt[:self.K, :]
        return B, F


    def inference(self, month):
        """Fitted returns for `month`: B_{N*K} @ F_{K*1}, using the last column of F."""
        B, F = self.__decompose(month)
        return B @ F[:, -1]


    def predict(self, month):
        """
        Predicted returns for `month`: B_{N*K} @ mean of all factor columns except the last.

        NOTE(review): B is still estimated from data that includes `month` itself - confirm this is intended.
        """
        B, F = self.__decompose(month)
        lag_F = F[:, :-1]
        return B @ np.mean(lag_F, axis=1)

In [None]:
# One-factor PCA model (K=1); `portfolio` defaults to True.
pca_1 = PCA(1)

In [4]:
import statsmodels.api as sm

class FF(modelBase):
    """
    Observable-factor model: betas are OLS slopes (no intercept column is added)
    of each characteristic portfolio's returns on the first K of
    ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD'].

    Factor data and portfolio returns are indexed by YYYYMM, so the YYYYMMDD
    dates used elsewhere are converted with `// 100`.
    """

    def __init__(self, K, portfolio=True):
        """
        K         : number of factors to use (prefix of `self.fname`)
        portfolio : stored but not read anywhere in this class
        """
        super(FF, self).__init__(f'FF_{K}')
        self.K = K
        self.portfolio = portfolio
        self.train_period[0] = 19630731 # ff5 data from FF website is only available from 196307
        self.__prepare_FFf()


    def __prepare_FFf(self):
        """Load the factor CSVs and the portfolio returns, and convert DATE to YYYYMM."""
        ff5 = pd.read_csv('../data/ff5.csv', index_col=0)
        UMD = pd.read_csv('../data/UMD.csv', index_col=0)
        UMD.columns = ['UMD']
        FFf = pd.concat([ff5, UMD.loc[196307:]], axis=1)
        self.fname = ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD']
        self.FFf = FFf[self.fname]
        self.portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')
        self.portfolio_ret['DATE'] = self.portfolio_ret['DATE'] // 100


    def train_model(self):
        """
        Estimate `self.beta_matrix` (index = charas, columns = first K factor names)
        over the current training window.

        This overrides the `modelBase.train_model` hook. It was previously only
        reachable as `train`, so calling `train_model()` did nothing.
        """
        first_mon = self.train_period[0] // 100
        last_mon = self.train_period[1] // 100
        factor_names = self.fname[:self.K]

        X = self.FFf[factor_names].loc[first_mon:last_mon]
        # Index once instead of once per characteristic.
        ret_by_month = self.portfolio_ret.set_index('DATE')

        betas = []
        for col in charas:
            y = ret_by_month[col].loc[first_mon:last_mon]
            # y and X are matched by position, not by index label.
            betas.append(sm.OLS(y.values, X.values).fit().params)
        self.beta_matrix = pd.DataFrame(betas, columns=factor_names, index=charas)


    def train(self):
        """Backward-compatible alias for `train_model`."""
        return self.train_model()


    def calBeta(self, month):
        """Return the fitted betas; they do not depend on `month`."""
        return self.beta_matrix # N * K


    def calFactor(self, month):
        """Return the first K factor values for the YYYYMM of `month`."""
        return self.FFf[self.fname[:self.K]].loc[month//100] # K * 1


    def cal_delayed_Factor(self, month):
        """
        Return the mean of the first K factors from valid_period[0] (as YYYYMM)
        through the calendar month before `month`.
        """
        prev = pd.to_datetime(str(month)) - relativedelta(months=1)
        last_mon = prev.year * 100 + prev.month
        return self.FFf[self.fname[:self.K]].loc[self.valid_period[0]//100:last_mon].mean()
        


In [186]:
from ipca import InstrumentedPCA

class IPCA(modelBase):
    """
    Instrumented PCA model built on `ipca.InstrumentedPCA`.

    The regression panel is indexed by (portfolio label, DATE) with the
    characteristics as regressors and the portfolio return 'p_ret' as target.
    """

    def __init__(self, K, portfolio=True):
        """
        K         : number of latent factors
        portfolio : stored but not read anywhere in this class
        """
        super(IPCA, self).__init__(f'IPCA_{K}')
        self.K = K
        self.portfolio = portfolio
        self.__prepare_data()

    def __prepare_data(self):
        """
        Load characteristics and returns and build `self.train_p_charas`.

        `self.p_charas` stays exactly as loaded; it no longer depends on a
        notebook-level `p_charas` variable, which is undefined on a fresh kernel.
        The panel is cut at test_period[1] as it stands when the model is constructed.
        """
        # NOTE(review): these paths are relative to the working directory, unlike the
        # '../data/' paths used by the other loaders in this notebook - confirm which is right.
        self.p_charas = pd.read_pickle('data/p_charas.pkl')
        portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')

        last_date = self.test_period[1]
        self.train_p_charas = (
            self.p_charas.loc[self.p_charas.DATE <= last_date]
            .assign(p_ret=0.0)                 # placeholder, filled per portfolio below
            .reset_index()
            .set_index(['index', 'DATE'])
            .sort_index()
        )

        ret_in_sample = portfolio_ret.loc[portfolio_ret.DATE <= last_date]
        for chara in charas:
            # Values are written by position; assumes portfolio_ret rows are date-ordered
            # like the sorted panel - TODO confirm.
            self.train_p_charas.loc[f'p_{chara}', 'p_ret'] = ret_in_sample[chara].values


    def train_model(self):
        """Fit IPCA with `self.K` factors and store `self.Gamma` and `self.Factors`."""
        y = self.train_p_charas['p_ret']
        X = self.train_p_charas.drop('p_ret', axis=1)

        # Was hard-coded to n_factors=1, which ignored K.
        regr = InstrumentedPCA(n_factors=self.K, intercept=False)
        regr = regr.fit(X=X, y=y)
        self.Gamma, self.Factors = regr.get_factors(label_ind=True)


    def calBeta(self, month):
        """Characteristics at `month` times Gamma: (N * P) @ (P * K) -> (N, K)."""
        month_rows = self.p_charas.loc[self.p_charas.DATE == month]
        return month_rows[charas].values @ self.Gamma.values

    def calFactor(self, month):
        """
        Return the last column of the fitted factors (K values).

        NOTE(review): `month` is ignored, so this is only the factor of `month`
        when `month` is the last date in the fitted panel.
        """
        return self.Factors.values[:, -1] # K * 1

    def cal_delayed_Factor(self, month):
        """Return the mean of all fitted factor columns except the last; `month` is ignored."""
        return np.mean(self.Factors.values[:, :-1], axis=1)
    

In [187]:
# One-factor IPCA model; the constructor loads the pickles and builds the regression panel.
ipca_1 = IPCA(1)

In [188]:
# Fit the model; this stores ipca_1.Gamma and ipca_1.Factors.
ipca_1.train_model()



The panel dimensions are:
n_samples: 94 , L: 94 , T: 370




Step 1 - Aggregate Update: 83.93888654724135
Step 2 - Aggregate Update: 17.472227222429947
Step 3 - Aggregate Update: 205.83398368455028
Step 4 - Aggregate Update: 2.3609990901344773
Step 5 - Aggregate Update: 1.3695223659476312
Step 6 - Aggregate Update: 0.7987212150883138
Step 7 - Aggregate Update: 0.4710586259590883
Step 8 - Aggregate Update: 0.284286210486286
Step 9 - Aggregate Update: 0.1794683748133039
Step 10 - Aggregate Update: 0.1135091998400668
Step 11 - Aggregate Update: 0.07518434413177744
Step 12 - Aggregate Update: 0.051149991246916215
Step 13 - Aggregate Update: 0.035559386628882805
Step 14 - Aggregate Update: 0.025131531970835397
Step 15 - Aggregate Update: 0.017975260337323107
Step 16 - Aggregate Update: 0.012963877576218863
Step 17 - Aggregate Update: 0.009401266968502853
Step 18 - Aggregate Update: 0.0068413612764359755
Step 19 - Aggregate Update: 0.004988550443734141
Step 20 - Aggregate Update: 0.003641227936519442
Step 21 - Aggregate Update: 0.002658712269877128
St

In [189]:
# Characteristics for 19871231; the mask is built from the model's own frame
# rather than a notebook-level `p_charas`, so the cell runs on a fresh kernel.
ipca_1.p_charas.loc[ipca_1.p_charas.DATE == 19871231][charas]

Unnamed: 0,absacc,acc,age,agr,bm,bm_ia,cashdebt,cashpr,cfp,cfp_ia,...,mom1m,mom36m,mom6m,mvel1,pricedelay,retvol,std_dolvol,std_turn,turn,zerotrade
p_absacc,0.481606,0.236818,-0.174510,-0.071504,-0.002495,-0.001688,0.002606,0.008795,-0.015107,-0.011110,...,-0.033751,0.011034,-0.034480,-0.055749,-0.000260,0.071590,0.038928,-0.000075,-0.000139,0.021115
p_acc,0.314403,0.416288,-0.078849,-0.107495,-0.000395,0.000468,0.003900,0.005394,-0.022115,-0.013142,...,-0.033855,0.071843,-0.019273,-0.009144,-0.007571,0.013448,-0.016144,0.006126,0.006573,-0.026393
p_age,-0.009232,-0.011069,0.927878,-0.002603,0.002074,0.002113,0.001659,-0.016321,-0.036340,-0.023360,...,0.026314,0.043107,0.080383,0.216489,-0.011311,-0.134114,-0.161412,-0.001300,0.007175,-0.107275
p_agr,-0.077018,-0.154301,0.063687,0.330684,-0.003004,0.000707,-0.008112,-0.007155,-0.023276,-0.005925,...,0.009350,-0.202119,0.040322,-0.021248,0.009032,0.041632,0.076987,-0.016038,-0.023820,0.112495
p_bm,-0.063829,-0.003429,0.143272,0.039306,0.029460,0.018273,0.004994,-0.039638,-0.010766,-0.001505,...,0.009477,-0.074949,0.050106,-0.006490,0.002405,-0.043400,0.046997,0.014943,0.001747,0.099971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p_retvol,0.050699,0.015327,-0.181837,-0.014764,-0.000316,-0.001189,-0.003650,0.008081,-0.004288,0.000025,...,-0.069470,-0.039941,-0.193675,-0.062951,0.001306,0.624649,0.037539,0.023190,0.011169,-0.188641
p_std_dolvol,0.024334,-0.008022,-0.292864,0.028287,0.001743,0.002517,-0.002049,-0.001145,-0.002614,-0.000058,...,0.034992,-0.071787,-0.008087,-0.229896,0.015349,0.084031,0.608947,0.039713,-0.018466,0.265372
p_std_turn,0.012104,0.019422,0.004350,-0.024035,0.013674,0.008703,0.001230,0.000401,0.003444,0.000345,...,-0.010339,0.031311,-0.058396,0.005027,-0.005412,0.126038,0.093460,0.159242,0.084730,-0.377140
p_turn,0.016746,0.019991,0.066974,-0.053951,0.013442,0.007335,0.000617,0.003872,0.005057,0.001998,...,-0.021564,0.084520,-0.031336,0.041814,-0.017222,0.069496,-0.140386,0.101544,0.127989,-0.538382


In [178]:
# Sanity check: loadings should have shape (N, K).
ipca_1.calBeta(19871231).shape

(94, 1)

In [190]:
# calFactor ignores its argument and returns the last fitted factor column.
ipca_1.calFactor(19871231)

array([-12.39745503])

In [191]:
# Fitted returns for the last test month: calBeta(month) @ calFactor(month).
ipca_1.inference(ipca_1.test_period[1])

array([-0.552309  , -0.1358724 ,  1.26482361, -0.33580301,  0.49575155,
        0.36668021,  0.91436532, -0.60771261,  0.61740301,  0.26614827,
       -0.17064198, -0.13199311,  0.17878339, -0.19056485,  0.1872946 ,
       -0.51765842, -0.50677094, -0.97166959, -0.27582025, -0.19797829,
        0.94920043,  0.52493149,  1.27197961, -0.02312139,  0.21879006,
       -0.03059484, -0.57367049, -0.00312371, -0.20368752,  0.78210276,
       -0.04116753,  0.68594271,  1.06113788, -0.17666862,  0.0099045 ,
        0.35944456, -0.24197158,  0.34890156,  0.42797523,  0.0266955 ,
       -0.11938838,  0.12095885,  0.12071087,  0.24222388,  0.65321208,
       -0.34703963, -0.81419924, -0.74127465, -1.0586386 ,  0.3267056 ,
        0.93330528,  0.31793601,  0.71903106,  0.17186807, -0.50954839,
       -0.25528631,  0.01198529, -0.31921637,  0.62369007, -0.5846925 ,
        0.32655848,  0.03613854, -0.51751702, -0.55588055,  0.08729769,
        0.13798004,  0.15032936, -0.24555692,  0.50313824, -1.27