In [1]:
import os
import random
import time
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import random
import seaborn as sns
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import lightgbm as lgb
from lightgbm import LGBMRegressor

import akshare as ak

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 350)

## 原始数据预处理

In [None]:
# Base price/volume features of every stock are stored in this pickle
# (see the factor_generate part for how the data is fetched and pre-processed).
df = pd.read_pickle('Factor_final.pkl')
df = df.dropna(how='any', axis=0).reset_index(drop=True)

# Restrict to tickers that already have a per-ticker sample file.  The original
# cell read `ticker_list` before any cell had defined it, which raises NameError
# on a fresh kernel; derive it from the output directory instead and skip the
# filter on a first run, when no per-ticker file exists yet.
_byticker_dir = r'D:\jupyter notebook\graduate\data_byticker'
ticker_list = (
    [name.replace('.pkl', '') for name in os.listdir(_byticker_dir)]
    if os.path.isdir(_byticker_dir) else []
)
if ticker_list:
    df = df[df['instrument'].isin(ticker_list)]

# Split the full sample by ticker and store one flattened look-back table per ticker.
def iter_ticker_sample(one_tick, lookback=10):
    """Flatten rolling look-back windows of one ticker and pickle the result.

    Parameters
    ----------
    one_tick : pandas.DataFrame
        Rows of a single instrument.  Must contain the columns ``date``,
        ``instrument``, ``label``, ``label_raw`` and ``group``; every other
        column is treated as a feature.
    lookback : int, default 10
        Number of consecutive rows flattened into one sample.

    Returns
    -------
    None
        The table is written to ``data_byticker\\<ticker>.pkl``.  Tickers with
        at most ``2 * lookback`` rows are skipped and nothing is written.
    """
    if len(one_tick) <= 2 * lookback:
        return None

    ticker = np.unique(one_tick['instrument'])[0]
    one_tick = one_tick.sort_values(by='date')
    meta_cols = ['date', 'instrument', 'label', 'label_raw', 'group']
    head = one_tick[meta_cols]
    number = one_tick.drop(columns=meta_cols)

    # Column names carry the lag relative to the window's last row:
    # <feature>_-(lookback-1) ... <feature>_0, oldest row first.
    col_new = [
        f'{col}_{lag}'
        for lag in range(-(lookback - 1), 1)
        for col in number.columns
    ]

    # Build all windows from the raw array at once instead of creating one
    # single-row DataFrame per window.
    # NOTE(review): as in the original loop, `end` stops at len(one_tick) - 1,
    # so the window ending on the very last row is never emitted - confirm
    # whether that is intended.
    values = number.values
    n_rows = len(one_tick)
    windows = [values[end - lookback:end].ravel() for end in range(lookback, n_rows)]
    flatten_df = pd.DataFrame(
        np.stack(windows),
        index=number.index[lookback - 1:n_rows - 1],  # label of each window's last row
        columns=col_new,
    )

    # Attach the meta columns; rows without a full window become NaN and are dropped.
    flatten_df = pd.concat([head, flatten_df], axis=1).dropna(how='any', axis=0)
    # Raw string: the backslashes are Windows path separators, not escapes.
    flatten_df.to_pickle(rf'D:\jupyter notebook\graduate\data_byticker\{ticker}.pkl')
    return None


# One frame per instrument; the workers write their own pickle and return None,
# so `ticker_sample` is only a list of None values.
group = [one_tick for _, one_tick in df.groupby(by='instrument')]
ticker_sample = Parallel(n_jobs=16, verbose=1)(
    delayed(iter_ticker_sample)(one_tick) for one_tick in group
)

# Tickers that actually produced a file (short histories are skipped by the worker).
# Raw string: the backslashes are Windows path separators, not escape sequences.
file_all = os.listdir(r'D:\jupyter notebook\graduate\data_byticker')
ticker_list = [filename.replace('.pkl', '') for filename in file_all]

# Re-arrange the samples by date: one feature / label / index / raw-label array per day.
# NOTE(review): arrays are written below an absolute D:/ path but later cells read
# them from the relative 'all_sample/' folder - this presumably relies on the
# notebook's working directory being D:/jupyter notebook/graduate; confirm.
lookback = 10
file_all = os.listdir('data_bydate/')
for filename in tqdm(file_all):
    date = filename.replace('.pkl', '')
    df1 = pd.read_pickle(f'data_bydate/{filename}')
    df1 = df1.dropna(how='any', axis=0)
    # Cross-sectional percentile rank of the raw label within this day.
    df1['label'] = df1['label_raw'].rank(pct=True)

    # Everything after the first five columns is treated as flattened features.
    feature = df1.iloc[:, 5:]
    n_sample = feature.shape[0]
    # Derive the per-step feature count instead of hard-coding 126, and fail
    # loudly when the column count cannot be a (lookback x n_ft) window.
    if feature.shape[1] % lookback != 0:
        raise ValueError(
            f'{filename}: {feature.shape[1]} feature columns are not a multiple of lookback={lookback}'
        )
    n_ft = feature.shape[1] // lookback

    feature = feature.values.reshape(n_sample, lookback, n_ft)
    label = df1['label'].values.reshape(n_sample, 1)
    label_raw = df1['label_raw'].values.reshape(n_sample, 1)
    index = df1['instrument'].values.reshape(n_sample, 1)

    # save_path
    np.save(f'D:/jupyter notebook/graduate/all_sample/feature/{date}.npy', feature)
    np.save(f'D:/jupyter notebook/graduate/all_sample/label/{date}.npy', label)
    np.save(f'D:/jupyter notebook/graduate/all_sample/index/{date}.npy', index)
    np.save(f'D:/jupyter notebook/graduate/all_sample/label_raw/{date}.npy', label_raw)

## 时序神经网络模型设计与训练

In [None]:
# Select the in-sample period
date_list = sorted(d.replace('.pkl', '') for d in os.listdir('data_bydate'))

end_year = 2021  # year in which the in-sample period stops; the three preceding years are used
insample_end = datetime.date(end_year, 1, 1).strftime('%Y%m%d')
insample_start = datetime.date(end_year - 3, 1, 1).strftime('%Y%m%d')

# Dates are 'YYYYMMDD' strings, so lexicographic comparison is chronological.
date_series = pd.Series(date_list)
insample_date = date_series[(date_series > insample_start) & (date_series < insample_end)].tolist()
train_date = insample_date[:-120]  # everything but the last 120 days
val_date = insample_date[-120:]    # last 120 in-sample days

# Load each feature file once; the per-day row counts are taken from the loaded
# arrays instead of reading every file a second time.
_train_parts = [np.load(f'all_sample/feature/{date}.npy') for date in train_date]
batch_list = [part.shape[0] for part in _train_parts]
X_train = np.concatenate(_train_parts, axis=0)
del _train_parts

y_train = [np.load(f'all_sample/label/{date}.npy') for date in train_date]
y_train = np.concatenate(y_train, axis=0)

# Batch size = average number of samples per training day.
batch_size = int(np.mean(batch_list))
print(batch_size)

print(X_train.shape)

In [None]:
# Neural-network part
class MyDataset(Dataset):
    """Map-style dataset pairing a 3-D feature array with its targets.

    ``features`` is indexed as ``features[idx, :, :]`` and ``targets`` as
    ``targets[idx]``; both items are returned as float32 tensors.
    """

    def __init__(self, features, targets):
        super().__init__()
        self.targets = targets
        self.features = features

    def __len__(self):
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        sample = self.features[idx, :, :]
        target = self.targets[idx]
        return (
            torch.as_tensor(sample, dtype=torch.float32),
            torch.as_tensor(target, dtype=torch.float32),
        )

# Features are clipped to [-5, 5]; batches are drawn in stored order (no shuffling).
train_dataset = MyDataset(features=np.clip(X_train, -5, 5), targets=y_train)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

# Custom network; the essential part is that the intermediate-layer output is
# exposed through `hook_output` (an LSTM is used as the example).
class LSTM_Model(nn.Module):
    """Linear encoder -> LSTM -> BatchNorm -> Dropout -> Linear -> Sigmoid.

    ``config`` must provide ``input_size``, ``hidden_size``, ``dropout``,
    ``num_layers`` and ``device``.  After every forward pass the tensor fed to
    the final linear layer is kept in ``self.hook_output``.
    """

    def __init__(self, config):
        super().__init__()
        input_size = config["input_size"]
        hidden_size = config["hidden_size"]
        dropout = config["dropout"]
        num_layers = config["num_layers"]

        # Filled by forward(): output of the dropout layer.
        self.hook_output = None

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Input shape: [batch, seq_len, d_model]
        self.encode = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(inplace=True),
        )
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.batchn = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

        self._device = config["device"]

        # Xavier-uniform initialisation for every parameter with more than one dimension.
        for weight in self.parameters():
            if weight.dim() <= 1:
                continue
            nn.init.xavier_uniform_(weight)

    def forward(self, x):
        """Return the sigmoid score of shape [batch, 1] and refresh ``hook_output``."""
        encoded = self.encode(x)
        sequence_out, _ = self.lstm(encoded)
        # Keep only the last time step of the LSTM output sequence.
        last_step = sequence_out[:, -1, :]
        features = self.dropout(self.batchn(last_step))
        self.hook_output = features
        return self.sigmoid(self.linear(features))
    

# Helper that fixes every random seed used in this notebook
def setup_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)
    and switch cuDNN to deterministic mode."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
 
# Fix the random seeds
setup_seed(1)

# Training device: the first CUDA GPU when one is available, otherwise the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

# Model / training configuration
lr = 1e-4
train_end = None
params = {
    'input_size': X_train.shape[-1],
    'hidden_size': 32,
    'dropout': 0.3,
    'num_layers': 1,
    'lr': 1e-4,
    'device': device,
    'output_size': y_train.shape[1],
}
params

In [None]:
# Print the network structure
def model_structure(model):
    """Print a table of the model's named parameters and return its rows.

    Each returned row is ``[name, shape, count]`` with the strings padded to
    30 / 40 / 10 characters (longer strings are left as they are).
    """
    rule = '-' * 90
    print(rule)
    print('|' + 'weight name'.rjust(22).ljust(32)
          + '|' + 'weight shape'.rjust(27).ljust(42)
          + '|' + 'number'.rjust(9).ljust(12) + '|')
    print(rule)

    type_size = 1  # would be 4 for floats
    num_para = 0
    outputlist = []
    for key, w_variable in model.named_parameters():
        each_para = w_variable.numel()
        num_para += each_para
        row = [
            key.ljust(30),
            str(w_variable.shape).ljust(40),
            str(each_para).ljust(10),
        ]
        print('| {} | {} | {} |'.format(*row))
        outputlist.append(row)

    print(rule)
    print('The total number of parameters: ' + str(num_para))
    print('The parameters of Model {}: {:4f}M'.format(model._get_name(), num_para * type_size / 1000 / 1000))
    print(rule)
    return outputlist

# Build a model only to list its parameters, then render the table as LaTeX.
my_lstm = LSTM_Model(params)
structure_list = model_structure(my_lstm)
latex_txt = pd.DataFrame(
    data=structure_list,
    columns=['weight name', 'weight shape', 'params number'],
).to_latex()
print(latex_txt)

In [None]:
# Start training: number of passes over the data, a fresh model on the target
# device and an Adam optimiser over its parameters.
epoch = 20
my_lstm = LSTM_Model(params).to(device)
optimizer = torch.optim.Adam(my_lstm.parameters(), lr=lr)

def ic_loss(x, y, hook_output):
    """Negative prediction/target correlation plus a feature-decorrelation penalty.

    ``x`` and ``y`` are column tensors of equal length; ``hook_output`` has one
    column per intermediate feature.  The penalty is 0.1 times the mean absolute
    off-diagonal entry of the feature correlation matrix.
    """
    n_features = hook_output.size(-1)

    # Pearson correlation between the two columns.
    pair = torch.cat((x, y), dim=1)
    rtn_corr = torch.corrcoef(pair.T)[0, 1]

    # Sum of |corr| minus the n_features ones on the diagonal, averaged over
    # the n_features * (n_features - 1) off-diagonal entries.
    abs_corr_total = torch.corrcoef(hook_output.T).abs().sum()
    ft_corr = (abs_corr_total - n_features) / (n_features * (n_features - 1))

    return -rtn_corr + 0.1 * ft_corr

loss_func = ic_loss

# Checkpoints are written once per epoch; make sure the target folder exists.
os.makedirs('model_save', exist_ok=True)

for step in tqdm(range(epoch)):
    print("\n{:-^120s}".format(f'epoch = {step}'))
    # Explicit train mode: dropout active, BatchNorm uses batch statistics.
    my_lstm.train()
    loss_all = []
    for tx, ty in train_loader:
        # BatchNorm1d cannot run on a single sample in train mode and a
        # correlation over one row is undefined, so skip a trailing batch
        # that holds fewer than two samples.
        if tx.size(0) < 2:
            continue
        tx = tx.to(device)
        ty = ty.to(device)

        output = my_lstm(tx)
        # Intermediate features captured by the model during this forward pass.
        hook_output = my_lstm.hook_output
        loss = loss_func(output, ty, hook_output)
        loss_all.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(np.mean(loss_all))
    # The whole module object is pickled, one file per epoch.
    torch.save(my_lstm, f'model_save/model_{insample_start}_{insample_end}_epoch{step}.pt')


In [None]:
# Extract the training results
def _load_epoch_model(step, device):
    """Load the checkpoint saved after epoch `step` and put it in eval mode.

    Uses the notebook-level `insample_start` / `insample_end` to build the file
    name.  `map_location` lets a checkpoint saved on GPU be opened on a CPU-only
    machine.  Eval mode disables dropout and makes BatchNorm use its running
    statistics, so predictions are deterministic.
    NOTE(review): the checkpoints are whole pickled modules; recent PyTorch
    releases default `torch.load` to `weights_only=True`, which may need to be
    overridden - confirm against the installed version.
    """
    model = torch.load(
        f'model_save/model_{insample_start}_{insample_end}_epoch{step}.pt',
        map_location=device,
    ).to(device)
    model.eval()
    return model


def _load_features(date, device):
    """Read one day's feature array and return it as a float32 tensor on `device`.

    The values are clipped to [-5, 5], matching the clipping applied to the
    training features.
    """
    X_test = np.clip(np.load(f'all_sample/feature/{date}.npy'), -5, 5)
    return torch.as_tensor(X_test, dtype=torch.float32).to(device)


def iter_predict_get(date):
    """Predictions of every saved epoch for `date`; one column per epoch.

    Relies on the notebook-level `epoch` for the number of checkpoints.
    """
    device = ("cuda:0" if torch.cuda.is_available() else "cpu")
    features = _load_features(date, device)
    y_pred_list = []
    with torch.no_grad():
        for step in range(epoch):
            my_lstm = _load_epoch_model(step, device)
            y_pred_list.append(my_lstm(features).cpu().numpy())
    y_pred_all = np.concatenate(y_pred_list, axis=1)
    torch.cuda.empty_cache()
    return y_pred_all


def iter_hookout_get(date, step):
    """Intermediate-layer output (`hook_output`) of the epoch-`step` model for `date`."""
    device = ("cuda:0" if torch.cuda.is_available() else "cpu")
    my_lstm = _load_epoch_model(step, device)
    with torch.no_grad():
        # The forward pass is run only for its side effect of filling hook_output.
        my_lstm(_load_features(date, device))
    hook_output = my_lstm.hook_output.detach().cpu().numpy()
    torch.cuda.empty_cache()
    return hook_output


def iter_predict_get_one(date, step):
    """Prediction of the epoch-`step` model for `date`."""
    device = ("cuda:0" if torch.cuda.is_available() else "cpu")
    my_lstm = _load_epoch_model(step, device)
    with torch.no_grad():
        y_pred = my_lstm(_load_features(date, device)).cpu().numpy()
    torch.cuda.empty_cache()
    return y_pred

epoch = 20
end_year = 2021
# Date boundaries ('YYYYMMDD' strings)
insample_end = datetime.date(end_year, 1, 1).strftime('%Y%m%d')
insample_start = datetime.date(end_year - 3, 1, 1).strftime('%Y%m%d')
outsample_end = datetime.date(end_year + 1, 1, 1).strftime('%Y%m%d')

date_series = pd.Series(date_list)
insample_date = date_series[(date_series > insample_start) & (date_series < insample_end)].tolist()
outsample_date = date_series[(date_series > insample_end) & (date_series < outsample_end)].tolist()

train_date = insample_date[:-120]
val_date = sorted(insample_date[-120:])
out_date = sorted(val_date + outsample_date)

# Predictions and intermediate-layer output of the epoch-12 model over
# validation + out-of-sample dates (consumed by the LGBM preparation cell).
res = [iter_predict_get_one(date, 12) for date in tqdm(out_date)]
hookout_res = [iter_hookout_get(date, 12) for date in tqdm(out_date)]

# Raw labels of the training / validation days.
y_train = [np.load(f'all_sample/label_raw/{date}.npy') for date in train_date]
y_train = np.concatenate(y_train, axis=0)

y_val = [np.load(f'all_sample/label_raw/{date}.npy') for date in val_date]
y_val = np.concatenate(y_val, axis=0)

# Per-epoch predictions over the in-sample dates.  The original built this frame
# from `res` (validation + out-of-sample rows), so slicing it with the train /
# validation row counts paired predictions with labels of different days.
# This is expensive: every date loads all `epoch` checkpoints.
insample_res = [iter_predict_get(date) for date in tqdm(insample_date)]
y_pred_df = pd.DataFrame(np.concatenate(insample_res, axis=0))
if len(y_pred_df) != len(y_train) + len(y_val):
    raise ValueError('in-sample predictions and labels have different row counts')

# First rows belong to the training days, last rows to the validation days.
y_train_pred = y_pred_df.head(y_train.shape[0]).copy()
y_train_pred['base'] = y_train.ravel()

y_val_pred = y_pred_df.tail(y_val.shape[0]).copy()
y_val_pred['base'] = y_val.ravel()


In [None]:
# Compare the training effect across epochs
from matplotlib.ticker import MaxNLocator

# Spearman correlation of every prediction column with the raw label ('base').
train_ic = y_train_pred.corr(method='spearman')['base']
val_ic = y_val_pred.corr(method='spearman')['base']

# The last row is 'base' against itself, so it is dropped.
ic_compare = pd.concat([train_ic, val_ic], axis=1).iloc[:-1, :]
ic_compare.columns = ['train', 'val']

ax = ic_compare.plot(figsize=(10, 5))
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.grid()

In [None]:
# Inspect the distribution of the intermediate-layer output
# Kept under its own name: overwriting `hookout_res` (validation + out-of-sample
# rows of the epoch-12 model) with in-sample rows would misalign the table that
# is later assembled for LGBM.
# NOTE(review): this cell inspects epoch 17 while the predictions use epoch 12 -
# confirm the intended checkpoint.
insample_hookout_res = [iter_hookout_get(date, 17) for date in tqdm(insample_date)]

hookout_df = pd.DataFrame(np.concatenate(insample_hookout_res, axis=0))
ft_size = hookout_df.shape[-1]

# Training part (first rows): exact zeros are hidden from the histograms.
hookout_tmp = hookout_df.head(y_train.shape[0])
hookout_tmp.replace(0., np.nan).hist(figsize=(20, 20), bins=50)
plt.show()

# Validation part (last rows).
hookout_tmp = hookout_df.tail(y_val.shape[0])
hookout_tmp.replace(0., np.nan).hist(figsize=(20, 20), bins=50)
plt.show()

In [None]:
# Statistics of the intermediate-layer output
# IC (Spearman correlation) of every intermediate feature with the prediction target
n_train_rows = y_train.shape[0]
train_hookout = hookout_df.head(n_train_rows)
train_hookout['label'] = y_train
# The last entry is the label against itself and is left out.
train_ft_ic = train_hookout.corr('spearman')['label'].iloc[:-1]
train_ft_ic.plot(kind='bar', figsize=(10, 5))
plt.grid()
plt.show()

# Correlation matrix among the intermediate features themselves
train_ft_corr = hookout_df.head(n_train_rows).corr()
# Mean absolute off-diagonal correlation (the diagonal contributes ft_size ones).
abs_corr_total = train_ft_corr.abs().values.sum()
print((abs_corr_total - ft_size) / (ft_size * (ft_size - 1)))

f = plt.figure(figsize=(20, 20))
f.add_subplot(111)
sns.heatmap(
    train_ft_corr,
    annot=True,
    square=True,
    cmap='RdBu_r',
    linewidths=0.3,
    vmax=1,
    vmin=-1,
    fmt='.2f',
    cbar=False,
)
plt.show()

In [None]:
# Assemble the intermediate-layer results for the LGBM stage
# One [date, instrument] pair per sample, in the same order as the saved arrays.
# A dedicated name is used for the per-day array so the notebook-level
# `ticker_list` (list of ticker strings) is not overwritten.
idx_full_list = []
for date in out_date:
    day_tickers = np.load(f'all_sample/index/{date}.npy', allow_pickle=True)
    idx_full_list.append(np.array([[date, row[0]] for row in day_tickers]))
idx_full = np.concatenate(idx_full_list, axis=0)

y_test = [np.load(f'all_sample/label/{date}.npy') for date in out_date]
y_test = np.concatenate(y_test, axis=0)

rtn_test = [np.load(f'all_sample/label_raw/{date}.npy') for date in out_date]
rtn_test = np.concatenate(rtn_test, axis=0)

y_pred_df = pd.DataFrame(np.concatenate(res, axis=0), columns=['nn_pred'])
hookout_df = pd.DataFrame(np.concatenate(hookout_res, axis=0))

# Everything below is joined purely by row position, so all pieces must cover
# exactly the same samples (e.g. `hookout_res` must come from `out_date`).
if not (len(y_pred_df) == len(idx_full) == len(y_test) == len(rtn_test) == len(hookout_df)):
    raise ValueError(
        'row count mismatch: '
        f'pred={len(y_pred_df)}, index={len(idx_full)}, label={len(y_test)}, '
        f'label_raw={len(rtn_test)}, hookout={len(hookout_df)}'
    )

y_pred_df[['date', 'id']] = idx_full
y_pred_df['label'] = y_test.ravel()
y_pred_df['label_raw'] = rtn_test.ravel()
df = pd.concat([y_pred_df, hookout_df], axis=1)
# df.to_pickle(f'lgbm_data/{out_date[0]}_{out_date[-1]}_trans.pkl')
df

## 集成学习模型LGBM再加工

In [None]:
# Load the intermediate-layer output of one specific date range
df = pd.read_pickle(
    'lgbm_data/20120710_20131231.pkl'
)
df

In [None]:
# Split in-sample / out-of-sample windows for the LGBM model
insample_len = 120
step = 10
ft_num = 32

date_list = sorted(df['date'].unique().tolist())
n_dates = len(date_list)

# Window start offsets, `step` dates apart; stop after the first window whose
# in-sample part plus one further step reaches the end of the date list.
move_group = (n_dates - insample_len) // step + 2
start_list = []
for offset in range(0, move_group * step, step):
    start_list.append(offset)
    if offset + insample_len + step >= n_dates:
        break

print(start_list)


In [None]:
# LGBM model design and training
# NOTE(review): per the LightGBM parameter docs bagging only takes effect when
# bagging_freq is non-zero, so 'bagging_fraction' is presumably inactive here.
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'verbose': -1,
    'bagging_fraction': 0.8,
    'num_leaves': 10,
    'seed': 2,
    # 'learning_rate'
}
# NOTE(review): the window starts were generated with a stride of 10 dates while
# each test window below spans 20 dates, so consecutive test windows overlap -
# confirm that this is intended.
step = 20


def correlation(a, train_data):
    """Custom LightGBM eval metric: Pearson correlation of predictions and labels.

    Returns ``('corr', value, True)``; the final True marks higher as better.
    """
    b = train_data.get_label()

    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    mean_a = np.sum(a) / len_data
    mean_b = np.sum(b) / len_data
    var_a = np.sum(np.square(a - mean_a)) / len_data
    var_b = np.sum(np.square(b - mean_b)) / len_data

    cov = np.sum((a * b)) / len_data - mean_a * mean_b
    corr = cov / np.sqrt(var_a * var_b)

    return 'corr', corr, True


feature_cols = list(range(ft_num))
res_list = []
out_sample_all = []
for start in start_list:
    window_date = date_list[start:start + insample_len]
    train_date = window_date[:-20]
    val_date = window_date[-20:]
    if not train_date or not val_date:
        continue
    # The test window starts right AFTER the validation window.  The original
    # started it after the already-truncated train_date, which made test_date
    # identical to val_date and evaluated on the early-stopping data.
    test_begin = date_list.index(val_date[-1]) + 1
    test_date = date_list[test_begin:test_begin + step]
    if not test_date:
        # No dates left after this window: nothing to evaluate.
        continue
    print(len(train_date), len(val_date), len(test_date))
    print(test_date)

    train_df = df[df['date'].isin(train_date)]
    val_df = df[df['date'].isin(val_date)]
    # Copy because a prediction column is added below.
    test_df = df[df['date'].isin(test_date)].copy()

    x_train = train_df[feature_cols].values
    y_train = train_df['label'].values

    x_val = val_df[feature_cols].values
    y_val = val_df['label'].values

    x_test = test_df[feature_cols].values
    y_test = test_df['label'].values

    train_dataset = lgb.Dataset(x_train, y_train)
    val_dataset = lgb.Dataset(x_val, y_val)
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); this call targets the
    # 3.x API - confirm the installed version.
    model = lgb.train(params = params,
                        train_set = train_dataset,
                        valid_sets = [val_dataset],
                        num_boost_round = 50,
                        early_stopping_rounds = 5,

                        verbose_eval = False,
                        feval = correlation)

    y_predict = model.predict(x_test)
    test_df['lgm_pred'] = y_predict
    out_sample_all.append(test_df)
    # Rank correlation of both models with the raw return on the test window;
    # the last row (label_raw vs. each column) is collected.
    res_tmp = test_df[['nn_pred', 'lgm_pred', 'label_raw']].corr('spearman')
    res_list.append(res_tmp.tail(1))
    display(res_tmp)
# latex_txt = pd.concat(res_list,axis=0).reset_index(drop=True).head(10).drop(columns=['label_raw']).T.to_latex()
# print(latex_txt)

In [None]:
# Feature importance reported by the LGBM model
# Uses `params`, `correlation`, `train_df` and the x/y arrays left behind by the
# LAST window of the rolling-training cell; the parameter dict and the metric
# are reused from there instead of being defined a second time.
train_dataset = lgb.Dataset(x_train, y_train)
val_dataset = lgb.Dataset(x_val, y_val)
# NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
# lgb.train in LightGBM 4.0 (replaced by callbacks) - confirm the installed version.
model = lgb.train(params = params,
                    train_set = train_dataset,
                    valid_sets = [val_dataset],
                    num_boost_round = 50,
                    early_stopping_rounds = 5,

                    verbose_eval = False,
                    feature_name = 'auto',
                    feval = correlation)

# One row per feature: number of splits and total gain.
ft_importance = pd.DataFrame([model.feature_importance(importance_type='split')
                             ,model.feature_importance(importance_type='gain')]).T
ft_importance.columns = ['split','gain']

# |Spearman IC| of every feature with the raw return.  The feature columns are
# selected explicitly so the rows line up with the model's feature order.
feature_cols = list(range(ft_num))
ic_importance = train_df[['label_raw'] + feature_cols].corr('spearman')['label_raw'].iloc[1:]
ft_importance['ic'] = ic_importance.abs().values

# Compare the three rankings side by side.
ft_importance.rank(axis=0).plot(kind='bar',figsize=(10,5))
plt.grid()
plt.show()

In [None]:
# Compare the prediction quality of the baseline (NN) model and the LGBM model
full_testdf = pd.concat(out_sample_all, axis=0)

# date x instrument tables of both predictions and of the raw return
test_nn, test_lgm, test_rtn = (
    full_testdf.pivot_table(index='date', columns='id', values=value_col)
    for value_col in ('nn_pred', 'lgm_pred', 'label_raw')
)

# Daily cross-sectional Spearman IC of the NN prediction
test_nn_ic = test_rtn.corrwith(test_nn, axis=1, method='spearman')
test_nn_ic.index = pd.to_datetime(test_nn_ic.index)
ax_nn = test_nn_ic.plot()
ax_nn.grid()
plt.show()

# Daily cross-sectional Spearman IC of the LGBM prediction
test_lgm_ic = test_rtn.corrwith(test_lgm, axis=1, method='spearman')
test_lgm_ic.index = pd.to_datetime(test_lgm_ic.index)
ax_lgm = test_lgm_ic.plot()
ax_lgm.grid()
plt.show()

# Mean/std ratio of the NN IC; the value is computed but not displayed here.
test_nn_ic.mean() / test_nn_ic.std()

# Both IC series from 2013 on, with a shaded +/-0.03 band.
compare_ic = pd.concat([test_lgm_ic, test_nn_ic], axis=1).loc['20130101':, :]
compare_ic.columns = ['lgbm', 'nn']
ax_cmp = compare_ic.plot(figsize=(10, 5))
ax_cmp.fill_between(compare_ic.index, 0.03, -0.03, color='yellow', alpha=0.4)
for level in (0.03, -0.03):
    plt.axhline(level, color='black', linestyle='--')
plt.show()


## 策略设计与回测

In [None]:
# Fetch the benchmark index data
sh01 = ak.stock_zh_index_daily(symbol="sh000001")
sz02 = ak.stock_zh_index_daily(symbol="sz000002")

for index_df in (sh01, sz02):
    # close * volume serves as the weight of each index
    index_df['amount'] = index_df['close'] * index_df['volume']
    index_df.index = index_df.date
    # one-day open-to-open change, shifted two rows back
    index_df['rtn'] = index_df['open'].pct_change(1).shift(-2)

# Dates present only in sh01 are filled into sz02 with the sh01 rows.
lack = sh01.loc[sh01.index.difference(sz02.index), :]
sz02 = pd.concat([sz02, lack], axis=0).sort_index()

# Amount-weighted average return of the two indices, indexed by 'YYYYMMDD' strings.
weighted_sum = sz02['amount'] * sz02['rtn'] + sh01['amount'] * sh01['rtn']
total_amount = sh01['amount'] + sz02['amount']
full_market_rtn = (weighted_sum / total_amount).to_frame('full_rtn')
full_market_rtn.index = full_market_rtn.index.astype('str').map(lambda day: day.replace('-', ''))

# Backtest
from FactorAnalysis import *

# LGBM prediction as the factor and the raw return, both from 2013 on.
df_factor, df_rtn = (
    table.loc['20130101':, :] for table in (test_lgm, test_rtn)
)

# Benchmark return on the same dates, re-indexed with datetimes.
benchmark = full_market_rtn.loc[df_rtn.index, :]
benchmark.index = pd.to_datetime(benchmark.index)

# NOTE(review): the meaning of the positional arguments [1] and 10 is defined by
# the project's FactorAnalysis module, which is not visible here.
fa_sub = FactorAnalysis(
    df_factor,
    df_rtn,
    [1],
    10,
    benchmark=benchmark,
    ret_kind='sum',
    cost=0.0008,
)
fa_sub.fast_analysis()