In [1]:
import dask.dataframe as dd
import time
from datetime import datetime
import pandas as pd
import numpy as np
from scipy.stats import mstats
#from progressbar import *
import matplotlib.pyplot as plt
import os 
import gc

In [2]:
if not os.path.exists('./Stats_Detail_New/'):
    os.makedirs('./Stats_Detail_New/')
if not os.path.exists('./Stats_Detail_New/multiway'):
    os.makedirs('./Stats_Detail_New/multiway')    
    
if not os.path.exists('./Weight/multiway'):
    os.makedirs('./Weight/multiway')
if not os.path.exists('./Graph_new/multiway'):
    os.makedirs('./Graph_new/multiway')
if not os.path.exists('./stats/multiway'):
    os.makedirs('./stats/multiway')

# 指標設定function

In [3]:
#數量前10大產業
def select_top10_ind(data):
    data_MV = data[data.MV_Select == 1]
    #倒敘排列大小順序
    select_ind = data_MV.groupby('Ind')['Code'].count().sort_values(ascending = False) 
    # why need to > 4?? -> 剔除少於4檔個股的產業
    select_ind = select_ind[select_ind > 4]
    # 前10大之Ind -> to list
    select_ind = select_ind.index[0:10].tolist()
    #return 1 or 0
    return [ ((x in select_ind) & ( y in data_MV['Code'].tolist())) * 1 
            for x, y in zip(data['Ind'], data['Code'])]

#some setting to calculate 因子值加權法
#極端值調整平均數 -> 去極值
#if s is all nan -> return s
#else -> 調整後1%、前1%的數值
def winsorize(s):
    if  sum(pd.isna(s) == True) == len(s):
        return s
    else:   
        return mstats.winsorize(s, limits=[0.01, 0.01])

#normalize to 0~1 range
def min_max(s):
    if sum(pd.isna(s) == True) == len(s):
        return s
    else:   
        return (s - np.min(s)) / (np.max(s) - np.min(s))

#標準化    
def stardize(s):
    if sum(pd.isna(s)==True)==len(s):
        return s
    else:   
        return (s-np.mean(s))/ np.std(s)  

#調整極端值後，進行normalize to 0~1    
def min_max_win(df,factor):
    df[factor+'_t-1'] = winsorize(df[factor+'_t-1'])
    df[factor+'_t-1'] = min_max(df[factor+'_t-1'])
    return df[factor+'_t-1']

#調整極端值後，進行標準化
def standardize_win(df,factor):
    df[factor+'_t-1'] = winsorize(df[factor+'_t-1'])
    df[factor+'_t-1'] = stardize(df[factor+'_t-1'])
    return df[factor+'_t-1']

#持有總值(Amount)、numbers of stocks(除以1000)
def Cal_Amount(df,factor,weight_method):
    df[factor+'_Nstock_'+weight_method] = [initial_account/2*x/y/1000 for x,y in zip(df[factor+'_Weight_'+weight_method],
                                                                                     df['Close'])]
    df[factor+'_Nstock_'+weight_method] = df[factor+'_Nstock_'+weight_method].apply(np.ceil)
    df[factor+'_Amount_'+weight_method] = df[factor+'_Nstock_'+weight_method]*df['Close']*1000    
    return df[factor+'_Nstock_'+weight_method],df[factor+'_Amount_'+weight_method]
#Profit
def Cal_Profit(df, factor, weight_method):
    df[factor+'_Profit_'+weight_method] = ((df['Close_t+1']-df['Close']) * df[factor+'_Nstock_'+weight_method])*1000
    # ex: 'short_index' in 'tri_short_index' -> True
    if 'short_index' in weight_method:
        #Rm -> 台灣發行量加權股價報酬指數 pct change
        Rm = df['Rm'].mode()
        Profit = df[factor + '_Profit_' + weight_method].sum() - initial_account * Rm/2
    elif 'long index' in weight_method:
        Rm = df['Rm'].mode()
        Profit = df[factor + '_Profit_' + weight_method].sum() + initial_account * Rm/2
    else:
        Profit = df[factor+'_Profit_'+weight_method].sum()
    return Profit

#Tax Cost
def Cal_Cost(df, factor, weight_method):
    #今天手上的股票數量-昨天的，再*股價
    holdsell_shortmore = df['Close']*(df[factor+'_Nstock_'+weight_method] - df[factor+'_Nstock_'+weight_method+'_t-1'])
    #今天的量比昨天少(<0) -> 賣出股票 -> tax
    holdsell_shortmore = abs(holdsell_shortmore[holdsell_shortmore < 0])
    df[factor + '_Tax_' + weight_method] = holdsell_shortmore
    trade_tax = np.sum(holdsell_shortmore*0.003*1000)
    return trade_tax

#Turnover(周轉率)
def Cal_Turnover(df,factor,weight_method):
    Turnover = np.sum(abs(df['Close']*(df[factor+'_Nstock_'+weight_method] - df[factor+'_Nstock_'+weight_method+'_t-1'])))*1000
    return Turnover

#計算權重
#1.排序權重法
#因子值加權、因子值排序加權
def cal_3group_weight_tri(df, factor, upper_pec, lower_pec, longbig, short_index=False, long_index=False, value=False):
    df = df.sort_values(factor+'_t-1', ascending=False)
    #給予排序
    df[factor+'_Rank'] = np.arange(len(df)) + 1
    length = len(df)
    middle = np.ceil(length /2)
    #因子值加權
    if value == True:
        #先去極值、標準化
        df[factor+'_t-1'] = winsorize(df[factor+'_t-1'])
        df[factor+'_t-1'] = min_max(df[factor+'_t-1'])
        #從中間切一半，分成large(1~100)、small(101~200) -> PE_t-1
        large_Rank = df[df[factor+'_Rank'] <= middle][factor+'_t-1']
        small_Rank = df[df[factor+'_Rank'] > middle][factor+'_t-1']
        #都為正數
        large_Rank = large_Rank    
        small_Rank = 1 - small_Rank
    
    #因子值排序加權
    else:                     
        #從中間切一半，分成large(1~100)、small(101~200) -> PE_Rank
        large_Rank = df[df[factor+'_Rank'] <= middle][factor+'_Rank'] 
        small_Rank = df[df[factor+'_Rank'] > middle][factor+'_Rank']
        #數值最大的逆著排 -> 100 99 98....1   #最大是一
        large_Rank = middle + 1 - large_Rank
        #因子數值最小的順著排 -> 1 2 3....100 #最小是一
        small_Rank = small_Rank - middle                             
    #買因子大的，賣因子小的 (一買一賣 -> delta對消的概念)
    if longbig == 1:
        #買大因子且放空指數 -> 因子小的權重為0
        if short_index == True:
            large_Rank = large_Rank / (large_Rank.sum())  
            small_Rank = small_Rank / (small_Rank.sum()) * 0 
        #賣小因子且做多指數 -> 因子大的權重為0
        elif long_index == True:
            large_Rank = large_Rank / (large_Rank.sum()) * 0
            small_Rank = small_Rank / (small_Rank.sum())              
        #不做指數，因此large、small各一半
        else:
            large_Rank = large_Rank / (large_Rank.sum())  
            small_Rank = small_Rank / (small_Rank.sum())             
        return pd.concat([large_Rank, -small_Rank]), df[factor+'_Rank'] 
    #賣因子大的，買因子小的
    else:
        #買小因子，且放空指數 -> 因子大的權重為0
        if short_index == True: 
            large_Rank = large_Rank / (large_Rank.sum()) * 0 
            small_Rank = small_Rank / (small_Rank.sum()) 
        #賣大因子，且做多指數 -> 因子小的權重為0
        elif long_index == True:
            large_Rank = large_Rank / (large_Rank.sum()) 
            small_Rank = small_Rank / (small_Rank.sum()) * 0 
        #不做指數，因此large、small各一半
        else:
            large_Rank = large_Rank / (large_Rank.sum())  
            small_Rank = small_Rank / (small_Rank.sum())  
        return  pd.concat([-large_Rank, small_Rank]), df[factor+'_Rank']

#2.平均權重法
#因子值加權、因子值排序加權
def cal_3group_weight_eq(df, factor, upper_pec, lower_pec, longbig, ind=False, short_index=False, long_index=False, value=False):
    df = df.sort_values(factor+'_t-1', ascending=False)
    df[factor+'_Rank'] = np.arange(len(df)) + 1    
    length = len(df)
    if ind == False:
        #upper、lower pecentage -> 看要取因子最大、最小的前幾% ex: 0.1、0.9
        upper = int(length*upper_pec)
        #numbers of upper
        N_upper = upper
        lower = int(length*lower_pec)
        #numbers of lower
        N_lower = length - lower
    #long/short因子值最大的2檔個股，並short/long因子值最小的2檔個股 -> eq_ind特有
    else:
        upper = 2
        lower = len(df) - 2
    #因子值加權
    if value == True: 
        #先去極值、標準化
        df[factor+'_t-1'] = winsorize(df[factor+'_t-1'])
        df[factor+'_t-1'] = min_max(df[factor+'_t-1'])
        #large rank -> 前幾%的PE_t-1 ex:取rank <= 200*0.1=20 -> 等同於取前10%
        large_Rank = df[df[factor+'_Rank'] <= upper][factor+'_t-1']
        #middle rank ?? -> 不配給權重
        middle_Rank = df[(lower >= df[factor+'_Rank'])&(df[factor+'_Rank'] > upper)][factor+'_t-1']*0
        #small rank -> ex:200*0.9 ->180， 取rank >180 -> 等同於取後10%
        small_Rank = df[df[factor+'_Rank'] > lower][factor+'_t-1']
        large_Rank = large_Rank
        large_Rank = large_Rank / large_Rank.sum()
        middle_Rank = middle_Rank * 0
        small_Rank = 1 - small_Rank 
        small_Rank = small_Rank / small_Rank.sum()
    #因子值排序加權
    else:            
        #取前10%的rank
        large_Rank = df[df[factor+'_Rank'] <= upper][factor+'_Rank']
        #equal weighted
        large_Rank = large_Rank/large_Rank/len(large_Rank)
        #中間的 ex:21~180，這裡不配給權重
        middle_Rank = df[(lower>=df[factor+'_Rank'])&(df[factor+'_Rank'] > upper)][factor+'_Rank']
        #取後10%的rank
        small_Rank = df[df[factor+'_Rank'] > lower][factor+'_Rank']
        #equal weighted
        small_Rank = small_Rank/small_Rank/len(small_Rank)            
    #買因子大的，賣因子小的 (一買一賣 -> delta對消的概念)
    if longbig == 1:
        #買大因子且放空指數 -> 因子小的權重為0
        if short_index == True:
            large_Rank  = large_Rank
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank   *0    
        #賣小因子且做多指數 -> 因子大的權重為0
        elif long_index == True:
            large_Rank  = large_Rank   *0
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank                   
        #不做指數，因此large、small各一半
        else:
            large_Rank  = large_Rank
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank
        return pd.concat([large_Rank, middle_Rank, -small_Rank]), df[factor+'_Rank']
    #賣因子大的，買因子小的
    else:         
        #買小因子，且放空指數 -> 因子大的權重為0
        if short_index == True:
            large_Rank  = large_Rank   *0   
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank
        #賣大因子，且做多指數 -> 因子小的權重為0
        elif long_index == True:
            large_Rank  = large_Rank      
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank   *0         
        #不做指數，因此large、small各一半
        else:
            large_Rank  = large_Rank
            middle_Rank = middle_Rank  *0
            small_Rank  = small_Rank
        return pd.concat([-large_Rank, middle_Rank, small_Rank]), df[factor+'_Rank']

# main strategy function 

In [55]:
def Cal_Weight(weight_method, df_MV, df_Ind, factor, longbig, Factor_args):
    #df_Ind ->前10大產業，此外並剔除少於4檔個股的產業
    less_stock_Ind = (df_Ind.groupby('Ind')[factor+'_t-1'].count() > 4).index.tolist()
    df_Ind = df_Ind[[x in less_stock_Ind for x in df_Ind['Ind']]]
    #共有幾個產業符合標準
    N_Ind_Select  = len(df_Ind['Ind'].unique()) 
    #依據權重選取方法使用上下界參數(eq or tri)
    if 'tri' in weight_method:                  
        upper_pec, lower_pec = Factor_args['upper_lower_perc_tri']['upper'], Factor_args['upper_lower_perc_tri']['lower']
    else:
        upper_pec, lower_pec = Factor_args['upper_lower_perc_eq']['upper'], Factor_args['upper_lower_perc_eq']['lower']
    
    ## main strategy
    ## 因子值排序加權
    #排序權重法
    if weight_method == 'tri':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV, factor, upper_pec, lower_pec, longbig)
    #排序權重法(考慮前10大產業產業)
    elif weight_method == 'tri_ind':
        #依照各個產業去分別分配權重 -> groupby('Ind')
        weight = df_Ind.groupby('Ind', as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec,
                                                                                             lower_pec,longbig)[0])
        #依照各個產業去分別排序
        rank = df_Ind.groupby('Ind', as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec,
                                                                                             lower_pec,longbig)[1])
        #上述求出的weight，再除回各個產業的個數 -> 真正的weight 
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #排序權重法(考慮產業、產業市值) 
    elif weight_method == 'tri_ind_cap' :
        weight = df_Ind.groupby('Ind', as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec,
                                                                                             lower_pec,longbig)[0]) 
        rank = df_Ind.groupby('Ind', as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec,
                                                                                             lower_pec,longbig)[1])        
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
        #最後 * 之前算出的Ind_CAP_Weight -> 依據產業市值分配產業權重 
        df_MV[factor+'_Weight_'+weight_method] = df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
    
    #平均權重法
    elif weight_method == 'eq':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor,upper_pec,lower_pec,longbig
                                                                                            ,ind=False)
    #平均權重法(考慮前10大產業)
    elif weight_method == 'eq_ind':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor,upper_pec,
                                                                                            lower_pec,longbig,ind=True)[0] )
        rank = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor,upper_pec,
                                                                                            lower_pec,longbig,ind=True)[1] )
        #上述求出的weight，再除回各個產業的個數 -> 真正的weight 
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)  
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    
    #放空指數區
    #tri + short
    elif weight_method == 'tri_short_index':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV, factor, upper_pec, lower_pec
                                                                                             , longbig, short_index=True)
    #tri_ind + short
    elif weight_method == 'tri_ind_short_index':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec
                                                                                                            ,lower_pec,longbig,short_index=True)[0])  
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec
                                                                                                            ,lower_pec,longbig,short_index=True)[1])   
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #tri_ind_cap + short 
    elif weight_method == 'tri_ind_cap_short_index':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                        ,upper_pec,lower_pec,longbig,short_index=True)[0])  
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                        ,upper_pec,lower_pec,longbig,short_index=True)[1])   
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
        df_MV[factor+'_Weight_'+weight_method] = df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
    #eq +short
    elif weight_method == 'eq_short_index':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor,upper_pec,lower_pec,longbig,ind=False,short_index=True)
    #eq_ind + short
    elif weight_method == 'eq_ind_short_index':  
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                         ,upper_pec,lower_pec,longbig,ind=True
                                                                         ,short_index=True)[0]) 
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                         ,upper_pec,lower_pec,longbig,ind=True
                                                                         ,short_index=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    
    #做多指數區
    #tri + long
    elif weight_method == 'tri_long_index':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV,factor
                                                                            ,upper_pec,lower_pec,longbig,long_index=True) 
    #tri_ind + long
    elif weight_method == 'tri_ind_long_index': 
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                            ,upper_pec,lower_pec,longbig,long_index=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                            ,upper_pec,lower_pec,longbig,long_index=True)[1])
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  =  pd.Series(rank.reset_index(level=0, drop=True))
    #tri_ind_cap + long
    elif weight_method == 'tri_ind_cap_long_index':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                           ,upper_pec,lower_pec,longbig,long_index=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                           ,upper_pec,lower_pec,longbig,long_index=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Rank']                  =  pd.Series(rank.reset_index(level=0, drop=True))
        df_MV[factor+'_Weight_'+weight_method] =  df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
    #eq + long
    elif weight_method == 'eq_long_index':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor,upper_pec,lower_pec,longbig,ind=False,long_index=True) 
    #eq_ind + long  
    elif weight_method == 'eq_ind_long_index':   
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                         ,upper_pec,lower_pec,longbig,ind=True
                                                                         , long_index=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                         ,upper_pec,lower_pec,longbig,ind=True
                                                                         , long_index=True)[1])
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  =  pd.Series(rank.reset_index(level=0, drop=True))
    
    ## 因子值加權
    #tri
    elif weight_method == 'tri_value':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV,factor,upper_pec,lower_pec,longbig,value=True)
    #tri_ind
    elif weight_method == 'tri_ind_value': 
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                           ,upper_pec,lower_pec,longbig,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                           ,upper_pec,lower_pec,longbig,value=True)[1])   
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True)/N_Ind_Select)
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #tri_ind_cap    
    elif weight_method == 'tri_ind_cap_value':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                            ,upper_pec,lower_pec,longbig,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                            ,upper_pec,lower_pec,longbig,value=True)[1])    
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
        df_MV[factor+'_Weight_'+weight_method] = df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
    #eq
    elif weight_method == 'eq_value':
        df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor,upper_pec,lower_pec,longbig,ind=False,value=True)
    #eq_ind
    elif weight_method == 'eq_ind_value':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                                            ,upper_pec,lower_pec,longbig
                                                                                            ,ind=True,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                                            ,upper_pec,lower_pec,longbig
                                                                                            ,ind=True,value=True)[1])
        df_MV[factor+'_Weight_'+weight_method] = pd.Series(weight.reset_index(level=0, drop=True))/N_Ind_Select  
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    
    #放空指數區
    #tri +short
    elif weight_method == 'tri_short_index_value':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV,factor,upper_pec,lower_pec,longbig
                                                                                             ,short_index=True,value=True)
    #tri_ind +short
    elif weight_method == 'tri_ind_short_index_value' :
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,short_index=True,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,short_index=True,value=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))/N_Ind_Select
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #tri_ind_cap +short    
    elif weight_method == 'tri_ind_cap_short_index_value':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,short_index=True,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,short_index=True,value=True)[1])  
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Rank']                  =  pd.Series(rank.reset_index(level=0, drop=True))
        df_MV[factor+'_Weight_'+weight_method] =  df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
    #eq + short
    elif weight_method == 'eq_short_index_value':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,ind=False,short_index=True,value=True)
    #eq_ind + short
    elif weight_method == 'eq_ind_short_index_value':  
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,ind=True,short_index=True,value=True)[0]) 
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,ind=True,short_index=True,value=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))/N_Ind_Select 
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    
    #做多指數區
    #tri + long
    elif weight_method == 'tri_long_index_value':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_tri(df_MV,factor,upper_pec,lower_pec,longbig
                                                                                             ,long_index=True,value=True) 
    #tri_ind + long
    elif weight_method == 'tri_ind_long_index_value' :
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,long_index=True,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor,upper_pec,lower_pec,longbig
                                                                                             ,long_index=True,value=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))/N_Ind_Select
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #tri_ind_cap + long
    elif weight_method == 'tri_ind_cap_long_index_value':
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,long_index=True,value=True)[0]) 
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_tri(x,factor
                                                                                             ,upper_pec,lower_pec,longbig
                                                                                             ,long_index=True,value=True)[1])  
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))
        df_MV[factor+'_Weight_'+weight_method] = df_MV[factor+'_Weight_'+weight_method]*df_MV["Ind_CAP_Weight"]
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
    #eq + long
    elif weight_method == 'eq_long_index_value':
        df_MV[factor+'_Weight_'+weight_method],df_MV[factor+'_Rank'] = cal_3group_weight_eq(df_MV,factor,upper_pec,lower_pec,longbig,
                                                                                              ind=False,long_index=True,value=True) 
    #eq_ind + long  
    elif weight_method == 'eq_ind_long_index_value':   
        weight = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor,upper_pec,lower_pec,longbig,
                                                                                              ind=True,long_index=True,value=True)[0])
        rank   = df_Ind.groupby('Ind',as_index=False).apply(lambda x : cal_3group_weight_eq(x,factor,upper_pec,lower_pec,longbig,
                                                                                              ind=True,long_index=True,value=True)[1]) 
        df_MV[factor+'_Weight_'+weight_method] =  pd.Series(weight.reset_index(level=0, drop=True))/N_Ind_Select 
        df_MV[factor+'_Rank']                  = pd.Series(rank.reset_index(level=0, drop=True))
 
    return df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank']

# settings

In [4]:
Factor_args = {}
#主要策略
Factor_args['weight_method'] = ['eq', 'tri', 'eq_ind', 'tri_ind', 'tri_ind_cap']
#考慮做多、放空指數
Factor_args['weight_method'] = Factor_args['weight_method']+[x+'_short_index' for x in Factor_args['weight_method']]+[x+'_long_index' for x in Factor_args['weight_method']]
#值加權
Factor_args['weight_method'] = Factor_args['weight_method']+[x+'_value' for x in Factor_args['weight_method']]
#因子
Main_Factor_Name = 'PE'
Main_Factor_Long = 0
periods = 1
#只取前後10%
Factor_args['upper_lower_perc_eq'] = {"upper" : 0.1, "lower" : 0.9}
#切一半
Factor_args['upper_lower_perc_tri'] = {"upper" : 0.5, "lower" : 0.5}
Factor_args[Main_Factor_Name] = { "factors" : [],
                                  "longbig" : [],
                                  "periods" : [] }

Factor_args[Main_Factor_Name]['factors'] = [Main_Factor_Name ]
#long bid = 0 -> #賣因子大的，買因子小的
Factor_args[Main_Factor_Name]['longbig'] = [Main_Factor_Long]*len(Factor_args[Main_Factor_Name]['factors']) 
Factor_args[Main_Factor_Name]['periods'] = [periods]

In [7]:
print(Factor_args)

{'weight_method': ['eq', 'tri', 'eq_ind', 'tri_ind', 'tri_ind_cap', 'eq_short_index', 'tri_short_index', 'eq_ind_short_index', 'tri_ind_short_index', 'tri_ind_cap_short_index', 'eq_long_index', 'tri_long_index', 'eq_ind_long_index', 'tri_ind_long_index', 'tri_ind_cap_long_index', 'eq_value', 'tri_value', 'eq_ind_value', 'tri_ind_value', 'tri_ind_cap_value', 'eq_short_index_value', 'tri_short_index_value', 'eq_ind_short_index_value', 'tri_ind_short_index_value', 'tri_ind_cap_short_index_value', 'eq_long_index_value', 'tri_long_index_value', 'eq_ind_long_index_value', 'tri_ind_long_index_value', 'tri_ind_cap_long_index_value'], 'upper_lower_perc_eq': {'upper': 0.1, 'lower': 0.9}, 'upper_lower_perc_tri': {'upper': 0.5, 'lower': 0.5}, 'PE': {'factors': ['PE'], 'longbig': [0], 'periods': [1]}}


# data

In [64]:
reader = pd.read_csv('./Data/output/df_merge_1_new.txt', chunksize=10 ** 6)
df_merge_final = pd.concat([x for x in reader], ignore_index=True)
#df_merge_final.count()

factor = Factor_args[Main_Factor_Name]['factors'][0]
periods = Factor_args[Main_Factor_Name]['periods'][0]

reader = pd.read_csv('./Data/output/df_merge_2.txt', chunksize=10 ** 6)
df_factor = pd.concat([x for x in reader], ignore_index=True)
df_merge_final = pd.merge(left=df_merge_final, right=df_factor[['Date','Code',factor]], on=['Date','Code'], how='left')
#df_merge_final.count()
del df_factor
gc.collect()

0

In [65]:
# shift 市值方便篩選成分股
df_merge_final['CAP_t-1'] = df_merge_final.groupby('Code')['CAP'].shift(1)
# main factor -> PE -> 計算T-1、T-2
factor_list = Factor_args[Main_Factor_Name]['factors']
for factor in factor_list :
    df_merge_final[factor+'_t-1'] = df_merge_final.groupby('Code')[factor].shift(1)
    df_merge_final[factor+'_t-2'] = df_merge_final.groupby('Code')[factor].shift(2)

# shift 收盤價方便計算投組損益
df_merge_final['Close_t+1']= df_merge_final.groupby('Code')['Close'].shift(-1)
# 調整開始時間 -> 2007/01/04
date_begin = np.array(df_merge_final['Date'].unique())[2]
df_merge_final = df_merge_final[df_merge_final['Date'] >= date_begin]
#以5000萬平均日成交金額為一篩選標準
df_merge_final['Volumn_a_select'] = (df_merge_final['Volumn_a_mean'] >= 50000000)*1
# 剔除Close , factor t-1 t-2 為na的資料   
df_merge_final = df_merge_final[pd.isna(df_merge_final['Close'])==False]
df_merge_final = df_merge_final[pd.isna(df_merge_final['CAP_t-1'])==False]
df_merge_final = df_merge_final[pd.isna(df_merge_final[factor+'_t-1'])==False]

In [66]:
first_date = np.array(df_merge_final.groupby('Year')['Date'].min())
#依照所有first date的資料，篩選出這些股票數據 -> True or False
df_Year = df_merge_final[[x in first_date for x in df_merge_final['Date']]]
df_Year = df_Year.sort_values(['Code','Year']).reset_index(drop=True)
temp    = df_Year[df_Year['Volumn_a_select']==1]
#count how many companies 平均日成交量 > 5000萬 every year
print(temp.groupby('Year')['Code'].count())

Year
2007    359
2008    515
2009    288
2010    393
2011    469
2012    342
2013    247
2014    312
2015    368
2016    322
2017    275
2018    377
2019    333
Name: Code, dtype: int64


In [67]:
#降序排名CAP(t-1)
#再選取CAP(t-1)前200大的
temp['MV_t-1_Rank'] = temp.groupby('Year')['CAP_t-1'].rank(ascending=False)
temp['MV_Select']   = (temp['MV_t-1_Rank'] < 201)*1
temp                = temp[['Code','Year','MV_Select','Volumn_a_select','CAP_t-1','Ind']]
temp_m              = temp[temp['MV_Select']==1] 
print(temp_m.groupby('Year')['Code'].count())

Year
2007    200
2008    200
2009    200
2010    200
2011    200
2012    200
2013    200
2014    201
2015    200
2016    200
2017    200
2018    200
2019    201
Name: Code, dtype: int64


In [68]:
# 標注每年200檔是值最大的個股，數量前10大產業
temp_m  = temp_m.sort_values(['Year','Ind','Code']).reset_index(drop=True)
Ind_select = temp_m.groupby('Year').apply(lambda x : select_top10_ind(x)).tolist()
Ind_select = [ x for y in Ind_select for x in y ]
temp_m['Ind_Select'] = Ind_select

In [69]:
# 各年度前10產業數量
temp_i= temp_m [temp_m.Ind_Select ==1]
print(temp_i.groupby('Year')['Code'].count())

Year
2007    155
2008    149
2009    143
2010    151
2011    147
2012    137
2013    144
2014    142
2015    145
2016    149
2017    155
2018    151
2019    148
Name: Code, dtype: int64


In [70]:
#計算10產業市值權重
#依照year、ind分類，去作加總 / #依照年分去作加總 -> 權重
Ind_Cap_Weight = temp_i.groupby(['Year','Ind'])['CAP_t-1'].sum() / temp_i.groupby(['Year'])['CAP_t-1'].sum()
temp_m["Ind_CAP_Weight"] = [ Ind_Cap_Weight[x][y] if z ==1 else 0 for x,y,z in 
                                 zip(temp_m["Year"],temp_m["Ind"],temp_m["Ind_Select"]) ]

In [71]:
df_merge_Year = pd.merge(left=df_merge_final, right=temp_m[['Year','Code','MV_Select','Ind_Select','Ind_CAP_Weight']]
                         , on=['Year','Code'], how='left')

del df_merge_final, temp_m
gc.collect()

120

In [72]:
df_merge_Year

Unnamed: 0,Code,Date,Close,Volumn,Volumn_a_mean,CAP,Year,Ind,大盤指數,Rm,PE,CAP_t-1,PE_t-1,PE_t-2,Close_t+1,Volumn_a_select,MV_Select,Ind_Select,Ind_CAP_Weight
0,1101,20070104,29.35,2318.0,262098608.0,922.1,2007,1.0,9071.09,0.002174,13.3,925.2,13.4,13.5,29.05,1,1.0,0.0,0.0
1,1101,20070105,29.05,7197.0,261089444.0,912.6,2007,1.0,8957.97,-0.012470,13.2,922.1,13.3,13.4,29.00,1,1.0,0.0,0.0
2,1101,20070108,29.00,6579.0,260653208.0,911.1,2007,1.0,8844.95,-0.012617,13.2,912.6,13.2,13.3,30.25,1,1.0,0.0,0.0
3,1101,20070109,30.25,10074.0,260225256.0,950.3,2007,1.0,8905.89,0.006890,13.7,911.1,13.2,13.2,29.75,1,1.0,0.0,0.0
4,1101,20070110,29.75,13399.0,259865000.0,934.6,2007,1.0,8801.28,-0.011746,13.5,950.3,13.7,13.2,29.05,1,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3797850,9962,20190506,12.05,85.0,2497884.0,10.9,2019,10.0,19441.43,-0.017950,12.1,10.9,12.1,12.1,12.00,0,,,
3797851,9962,20190507,12.00,32.0,2495608.0,10.8,2019,10.0,19602.03,0.008261,12.0,10.9,12.1,12.1,12.00,0,,,
3797852,9962,20190508,12.00,153.0,2501720.0,10.8,2019,10.0,19488.95,-0.005769,12.0,10.8,12.0,12.1,11.90,0,,,
3797853,9962,20190509,11.90,86.0,2503732.0,10.7,2019,10.0,19149.90,-0.017397,11.9,10.8,12.0,12.0,11.80,0,,,


# main strategy

In [73]:
import warnings
warnings.filterwarnings("ignore")

In [74]:
weight_method_list = Factor_args['weight_method']
factor_list = Factor_args[Main_Factor_Name]['factors']
longbig_list = Factor_args[Main_Factor_Name]['longbig']
Date_list = np.sort(df_merge_Year['Date'].unique())

factor  = factor_list[0]
longbig = longbig_list[0]

In [75]:
for k in range(len(weight_method_list)):
    weight_method   = weight_method_list[k]
    print(weight_method)
    initial_account = 1000000000
    df_Final = pd.DataFrame()
    need_col       = ['_Nstock_' ,'_Amount_','_Weight_']
    need_col       = [factor+x+weight_method for x in need_col]
    need_col_less1 = [x+'_t-1' for x in need_col]
    #progress = ProgressBar()
    for i in range(len(Date_list)-1):
        # day 1
        if i == 0:
            Date   = Date_list[i]
            df_MV  = df_merge_Year[df_merge_Year.Date == Date]
            df_MV  = df_MV[df_MV.MV_Select == 1]
            df_Ind = df_MV[df_MV.Ind_Select == 1]
            
            df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = Cal_Weight(weight_method,df_MV,df_Ind,factor,longbig,Factor_args)
            df_MV = df_MV[['Code', 'Date','Close','Close_t+1',factor+'_t-1',factor+'_t-2',factor+'_Rank'
                           ,factor+'_Weight_'+weight_method,'Rm']]
            df_MV = df_MV[pd.isna(df_MV[factor+'_Weight_'+weight_method]) == False]
            #去掉權重為0的
            df_MV = df_MV[df_MV[factor+'_Weight_'+weight_method] != 0]
            df_MV[factor+'_Nstock_'+weight_method], df_MV[factor+'_Amount_'+weight_method] = Cal_Amount(df_MV,factor,weight_method)
            df_Final = df_MV
        # day 2 ~
        else:
            df_less1 = df_MV
            Date   = Date_list[i]
            df_MV  = df_merge_Year[df_merge_Year.Date == Date]
            df_MV  = df_MV[df_MV.MV_Select == 1]
            df_Ind = df_MV[df_MV.Ind_Select == 1]
            
            df_MV[factor+'_Weight_'+weight_method], df_MV[factor+'_Rank'] = Cal_Weight(weight_method,df_MV,df_Ind,factor,longbig,Factor_args)
            df_MV = df_MV[['Code', 'Date','Close','Close_t+1',factor+'_t-1',factor+'_t-2',factor+'_Rank'
                           ,factor+'_Weight_'+weight_method,'Rm']]
            df_MV = df_MV[pd.isna(df_MV[factor+'_Weight_'+weight_method]) == False]
            df_MV = df_MV[df_MV[factor+'_Weight_'+weight_method] != 0]
            df_MV[factor+'_Nstock_'+weight_method], df_MV[factor+'_Amount_'+weight_method] = Cal_Amount(df_MV,factor,weight_method)
            #將t存為t-1
            df_less1[need_col_less1+[factor+'_Rank_t-1','Close_t+1_less1']] = df_less1[need_col+[factor+'_Rank','Close_t+1']]
            df_MV_m = pd.merge(left=df_MV,right=df_less1[need_col_less1+['Code',factor+'_Rank_t-1','Close_t+1_less1']]
                               ,on=['Code'],how='outer')
            df_MV_m['Date'] = df_MV_m['Date'].fillna(method='ffill')
            df_MV_m['Close']= df_MV_m['Close'].fillna(df_MV_m['Close_t+1_less1'])
            df_MV_m = df_MV_m.fillna(0)
            #合併t、t-1
            df_Final = pd.concat([df_Final,df_MV_m],axis=0)
    if not os.path.exists('./Stats_Detail_New/multiway/factor'):
        os.makedirs('./Stats_Detail_New/multiway/factor')
    if not os.path.exists('./Weight/multiway/factor'):
        os.makedirs('./Weight/multiway/factor')     
    #output    
    df_Final.to_csv('./Weight/multiway/factor/'+factor+'_Weight_'+weight_method+'.txt',index=False)
    
    #performance
    df_performance = df_Final.reset_index(drop=True)
    
    df_Equity_Trunover_Tax = pd.DataFrame()
    df_Equity_Trunover_Tax[['Date','Profit_'+weight_method]] = df_performance.groupby('Date').apply(lambda x : 
                                                                            Cal_Profit(x,factor ,weight_method)).reset_index()
    df_Equity_Trunover_Tax['Tax_'+weight_method] = df_performance.groupby('Date').apply(lambda x : 
                                                                            Cal_Cost(x,factor ,weight_method)).reset_index(drop=True)
    df_Equity_Trunover_Tax['Turnover_'+weight_method] = df_performance.groupby('Date').apply(lambda x : 
                                                                            Cal_Turnover(x,factor ,weight_method)).reset_index(drop=True)
    df_Equity_Trunover_Tax['Equity_'+weight_method] = initial_account + df_Equity_Trunover_Tax['Profit_'+weight_method].cumsum() - df_Equity_Trunover_Tax['Tax_'+weight_method].cumsum()
    df_Equity_Trunover_Tax.to_csv('./Stats_Detail_New/multiway/factor/'+factor+'_Stats_Detail_New_'+weight_method+'.txt',index=False)
    gc.collect()
    

eq
tri
eq_ind
tri_ind
tri_ind_cap
eq_short_index
tri_short_index
eq_ind_short_index
tri_ind_short_index
tri_ind_cap_short_index
eq_long_index
tri_long_index
eq_ind_long_index
tri_ind_long_index
tri_ind_cap_long_index
eq_value
tri_value
eq_ind_value
tri_ind_value
tri_ind_cap_value
eq_short_index_value
tri_short_index_value
eq_ind_short_index_value
tri_ind_short_index_value
tri_ind_cap_short_index_value
eq_long_index_value
tri_long_index_value
eq_ind_long_index_value
tri_ind_long_index_value
tri_ind_cap_long_index_value
