In [101]:
import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta

from tqdm import tqdm

from joblib import delayed, Parallel 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Scratch check: parse a YYYYMMDD integer into a datetime and step back nine months.
str(datetime.datetime.strptime(str(19870131),'%Y%m%d') - relativedelta(months=9))

In [7]:
class modelBase:
    """
    Base class for factor models evaluated on a train / valid / test schedule.

    Dates are integers in YYYYMMDD form, so adding 10000 moves a date forward by
    exactly one calendar year. Subclasses supply `calBeta`, `calFactor` and
    `cal_delayed_Factor` (and optionally `train`); this class provides the
    period bookkeeping and the beta @ factor products built on top of them.
    """

    def __init__(self, name):
        """
        name: label identifying the model, stored as `self.name`.
        """
        self.name = name
        self.train_idx = 0  # number of completed refits

        # initial train, valid and test periods default to those of the original paper
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period  = [19870101, 19871231]


    def train(self):
        """
        Fit the model on the current periods. No-op by default so that models
        without a fitting step can still be refitted; override when needed.
        """
        pass


    def calBeta(self, month):
        """
        Calculate specific month's beta. Should be specified by different models
        -> return np.array, dim = (N, K)
        """
        # Fail loudly: returning None here used to surface later as an opaque
        # AttributeError on `.shape` inside inference()/predict().
        raise NotImplementedError(f"{type(self).__name__} must implement calBeta")


    def calFactor(self, month):
        """
        Calculate specific month's factor. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """
        raise NotImplementedError(f"{type(self).__name__} must implement calFactor")


    def cal_delayed_Factor(self, month):
        """
        Calculate delayed month's factor, i.e. mean average of factors up to t-1. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """
        raise NotImplementedError(f"{type(self).__name__} must implement cal_delayed_Factor")


    def inference(self, month):
        """
        Return beta @ factor for `month`, using that month's own factor.

        Only the lower bound of the test period is enforced here, so any month
        on or after `test_period[0]` is accepted.
        Raises AssertionError on an early month or a beta/factor dimension mismatch.
        """
        assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"

        mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)

        assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"

        # R_{N*1} = Beta_{N*K} @ F_{K*1}
        return mon_beta @ mon_factor


    def predict(self, month):
        """
        Return beta @ lagged-average factor for `month`.

        `month` must lie inside the current test period (both bounds inclusive).
        Raises AssertionError on an out-of-window month or a dimension mismatch.
        """
        assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}"

        lag_factor, mon_beta = self.cal_delayed_Factor(month), self.calBeta(month)

        assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"

        # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1}
        return mon_beta @ lag_factor


    def refit(self):
        """
        Advance the schedule by one year and retrain.

        The training window expands (only its end moves), while the validation
        and test windows shift forward as a whole.
        """
        one_year = 10000  # YYYYMMDD + 10000 == same day, next year

        self.train_period[1] += one_year
        self.valid_period = [date + one_year for date in self.valid_period]
        self.test_period = [date + one_year for date in self.test_period]

        self.train()

        self.train_idx += 1
        print(f'Model has been refitted [{self.train_idx}]')
        

In [99]:
# Load the pickled frame into `datashare`; the path is relative to the notebook's working directory.
datashare = pd.read_pickle('../data/datashare_re.pkl')

In [100]:
# Display the loaded frame (the recorded output is a truncated preview).
# NOTE(review): consider `datashare.head()` to keep the saved output small.
datashare

Unnamed: 0,permno,DATE,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,stdcf,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2
0,10006,19570329,-0.936035,0.044983,-0.378174,0.485840,0.590820,-0.637207,-1.000000,0.310547,...,0.000000,0.00,-0.672852,-0.985840,-0.475098,-0.682617,0.520020,-0.650391,-1.000000,37.0
1,10014,19570329,-0.997559,-0.802246,-0.945312,0.234497,-0.172729,0.162109,-1.000000,-0.228516,...,0.000000,0.00,-0.200806,-0.214355,0.285645,-0.162720,0.567871,-0.687988,-0.866699,35.0
2,10022,19570329,-0.993164,-0.144409,-0.558594,0.336914,-0.111572,-0.548340,-1.000000,0.073975,...,0.000000,0.00,-0.775391,-0.497803,-0.640137,-0.583984,0.651367,-0.767090,-1.000000,35.0
3,10030,19570329,-0.961426,-0.204224,-0.608887,0.158447,0.408447,-0.693848,-1.000000,0.298340,...,0.000000,0.00,-0.789551,-0.990234,-0.678711,-0.813477,0.223145,-0.689453,-1.000000,35.0
4,10057,19570329,-0.974121,0.165161,-0.247681,0.019379,0.152100,-0.625488,-1.000000,0.112305,...,0.000000,0.00,-0.839844,-0.978027,-0.870605,-0.882812,0.478271,-0.661621,-0.199951,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3760203,93428,20161230,-0.976562,-0.006817,-0.579102,-0.247070,0.548828,-0.539062,-0.552734,-0.281006,...,-1.000000,0.50,-0.633789,-1.000000,-0.882812,-0.746094,-0.717773,-0.921875,-1.000000,73.0
3760204,93429,20161230,-0.895508,-0.705566,-0.985352,0.005951,0.721680,-0.723145,-0.675293,-0.076721,...,-1.000000,0.50,-0.804199,-1.000000,-0.816406,-0.854004,-0.675293,-0.908203,-1.000000,62.0
3760205,93433,20161230,-0.999512,0.386230,-0.105835,0.095642,-0.059082,1.000000,-0.579102,0.320557,...,-0.909668,-0.50,0.904785,-0.981934,1.000000,1.000000,-0.032288,-0.574219,-1.000000,65.0
3760206,93434,20161230,-0.998535,-0.527832,-0.934570,-0.130615,0.050079,-0.359863,-0.554199,-0.384766,...,-0.999512,-0.25,-0.460205,-0.996094,-0.862793,-0.717285,-0.597656,-0.964355,-1.000000,1.0


In [None]:
# Placeholder instance of the bare base class, used by the next cells to exercise its methods.
fake_model = modelBase('fake')

In [None]:
# The base class defines no training logic, so this call does nothing.
fake_model.train()

In [None]:
# Each call extends the training end date and shifts the valid/test periods by one year (+10000).
# Not idempotent: re-running this cell advances the periods again.
fake_model.refit()

In [None]:
# NOTE(review): the base class does not implement calBeta / cal_delayed_Factor,
# so this call cannot succeed on `fake_model`; it only exercises the month check.
fake_model.predict(19880301).shape

In [None]:
class sonModel(modelBase):
    """Minimal subclass that fixes the model name to 'PCA' and inherits everything else."""

    def __init__(self):
        # Zero-argument super() resolves to the same parent initializer.
        super().__init__('PCA')

In [None]:
# Instantiate the minimal subclass; its name is fixed to 'PCA' by sonModel.__init__.
pca_model = sonModel()

In [92]:
def stock_R_matrix(start_date, end_date):
    """
    Load the pickled return matrix and keep the columns labelled
    `start_date` .. `end_date` (label-based slice, both ends inclusive).

    The file is read from disk on every call.
    -> return the loaded frame restricted to those columns; rows are untouched
    """
    R_matrix = pd.read_pickle('../data/stock_R_matrix.pkl')
    # Slice the column axis directly; transposing the whole panel twice to
    # reach the same columns only costs two extra full copies.
    return R_matrix.loc[:, start_date:end_date]


def portfolio_R_matrix(start_date, end_date):
    """
    Load the pickled portfolio returns, keep the rows whose 'DATE' lies in
    [start_date, end_date] (both inclusive), and return them transposed so
    that dates become the columns.
    """
    returns_long = pd.read_pickle('../data/portfolio_ret.pkl')
    in_window = returns_long['DATE'].between(start_date, end_date)
    selected = returns_long[in_window]
    return selected.set_index('DATE').T

In [85]:
# Read the return matrix restricted to the columns labelled 19570329 .. 20161230.
all_matrix = stock_R_matrix(19570329, 20161230)

In [86]:
# NOTE(review): this writes to the same file that stock_R_matrix() reads from, so the
# source pickle is overwritten in place by its own slice — confirm this is intended.
all_matrix.to_pickle('../data/stock_R_matrix.pkl')

In [None]:
# NOTE(review): modelBase's default train_period ends at 19741231, while this slice stops
# at 19740101 — confirm whether the 1974 columns are meant to be left out here.
r_matrix = stock_R_matrix(19570101, 19740101)

In [None]:
# Scratch SVD of the column-de-meaned matrix with missing entries zero-filled
# (the same preprocessing the PCA class applies).
u, sigma, vt = np.linalg.svd((r_matrix - r_matrix.mean()).fillna(0))

In [96]:
class PCA(modelBase):
    """
    PCA factor model: betas and factors are read off the singular value
    decomposition of the de-meaned return matrix covering
    `train_period[0]` .. `month`.

    With R = U diag(s) V^T, the first K columns of U are the loadings and
    diag(s_K) V_K^T holds one factor realisation per column of R.
    """

    def __init__(self, K):
        """
        K: number of leading components to keep.
        """
        super(PCA, self).__init__(f'PCA_{K}')
        self.K = K


    def __col_de_mean(self, matrix):
        """Subtract each column's mean, then replace missing entries with 0."""
        return (matrix - matrix.mean()).fillna(0)


    def _svd_components(self, month):
        """
        Shared decomposition behind calBeta / calFactor / cal_delayed_Factor.
        -> return (u_K, sigma_K, vt_K) with dims (N, K), (K,), (K, T)
        """
        stock_r_matrix = self.__col_de_mean(stock_R_matrix(self.train_period[0], month))
        # Reduced SVD: only the first K components are used, so there is no
        # reason to build the full N*N left basis. The frame is converted to a
        # plain ndarray explicitly so the results are ndarrays too.
        u, sigma, vt = np.linalg.svd(stock_r_matrix.to_numpy(), full_matrices=False)
        return u[:, :self.K], sigma[:self.K], vt[:self.K]


    def calBeta(self, month):
        """
        Loadings estimated on data up to and including `month`.
        -> return np.array, dim = (N, K)
        """
        # B_{N*K}
        B, _, _ = self._svd_components(month)
        return B


    def calFactor(self, month):
        """
        Factor realisation for the last column of the sample.
        -> return np.array, dim = (K,)
        """
        _, sigma, vt = self._svd_components(month)
        # F_{K}: scaling row k of V^T by s_k equals diag(s) @ V^T
        F = sigma * vt[:, -1]
        return F


    def cal_delayed_Factor(self, month):
        """
        Mean of the factor realisations over every column except the last one.
        -> return np.array, dim = (K,)

        NOTE(review): the decomposition itself still includes the last column
        (i.e. `month`), so the lagged factors are estimated with that month's
        data in the sample — confirm this is acceptable for out-of-sample use.
        """
        _, sigma, vt = self._svd_components(month)
        # lag_F_{K*(T-1)}
        lag_F = sigma[:, None] * vt[:, :-1]
        return np.mean(lag_F, axis=1)

In [97]:
# One-component PCA model; its name becomes 'PCA_1'.
pca_1 = PCA(1)

In [98]:
# NOTE(review): the recorded output is exactly 0 — check whether the last column of the
# slice ending at 19870228 is entirely missing (it would be zero-filled after de-meaning).
pca_1.calFactor(19870228)

array([0.])

In [30]:
# Load the pickled list of month dates (presumably YYYYMMDD integers, given the comparison below — confirm).
mon_list = pd.read_pickle('../data/mon_list.pkl')

In [63]:
# Keep only the months on or after the start of pca_1's current test period.
test_list = mon_list.loc[mon_list >= pca_1.test_period[0]]
# test_list.columns = ['Mon']

In [62]:
# NOTE(review): this indexes `test_list` by a 'Mon' column, whereas the yearly loop below
# calls `.apply(...)` / `.to_list()` on `test_list` as a whole. The execution counts also
# show this cell ran before `test_list` was last rebuilt — confirm it is still needed.
test_list['Year'] = test_list['Mon'].apply(lambda x: x//10000)

In [83]:
# Walk the test months one calendar year at a time. Dates are YYYYMMDD integers,
# so integer division by 10000 yields the year.
months_by_year = test_list.groupby(test_list // 10000)
n_years = months_by_year.ngroups  # real number of groups instead of a hard-coded 30
for idx, (year, months) in enumerate(months_by_year, start=1):
    print(f'[Inferencing] Year: {year}, idx: {idx}/{n_years} ')
    b = months.to_list()

    # inference = Parallel(n_jobs=12)(delayed(pca_1.inference)(mon) for mon in tqdm(b))
    # break
    

[Inferencing] Year: 1987, idx: 1/30 
[Inferencing] Year: 1988, idx: 2/30 
[Inferencing] Year: 1989, idx: 3/30 
[Inferencing] Year: 1990, idx: 4/30 
[Inferencing] Year: 1991, idx: 5/30 
[Inferencing] Year: 1992, idx: 6/30 
[Inferencing] Year: 1993, idx: 7/30 
[Inferencing] Year: 1994, idx: 8/30 
[Inferencing] Year: 1995, idx: 9/30 
[Inferencing] Year: 1996, idx: 10/30 
[Inferencing] Year: 1997, idx: 11/30 
[Inferencing] Year: 1998, idx: 12/30 
[Inferencing] Year: 1999, idx: 13/30 
[Inferencing] Year: 2000, idx: 14/30 
[Inferencing] Year: 2001, idx: 15/30 
[Inferencing] Year: 2002, idx: 16/30 
[Inferencing] Year: 2003, idx: 17/30 
[Inferencing] Year: 2004, idx: 18/30 
[Inferencing] Year: 2005, idx: 19/30 
[Inferencing] Year: 2006, idx: 20/30 
[Inferencing] Year: 2007, idx: 21/30 
[Inferencing] Year: 2008, idx: 22/30 
[Inferencing] Year: 2009, idx: 23/30 
[Inferencing] Year: 2010, idx: 24/30 
[Inferencing] Year: 2011, idx: 25/30 
[Inferencing] Year: 2012, idx: 26/30 
[Inferencing] Year: 2

In [77]:
# NOTE(review): `inference` is only assigned in a commented-out line of the yearly loop,
# so this cell fails on a fresh kernel unless that line is restored.
pd.DataFrame(inference).shape

(12, 13907)