Define 4 PSO(Particle Swarm Optimization) algorithms to adjust the parameter

1. RF_PSO (Random Forest)，ADA_PSO (Adaboost)，GBDT_PSO output: the sequence of predicted returns of stocks in each time cross-section
2. Rolling training --> sec is the ranking of stock returns

Delete features that are missing more than 5% at each time cross-section, and then delete samples with missing values

The importance of 209 candidate factors is obtained by Decision Tree and selected according to the importance.

1. The number of features in different time cross-sections is different. To prevent over-fitting, when building a predictive model with three models for each time cross-section, a maximum of 10 features can be used.
2. Feature selection and modeling of three prediction models are done on 121 time cross-sections.
3. Add up the feature importance of the selected feature of each time cross-section: "特征重要度汇总.xls"

# tree_PSO
    用于特征筛选时，调参决策树

In [159]:
# ----------------------PSO参数设置---------------------------------
class tree_PSO():
    def __init__(self, pN, dim, max_iter, train_x, train_y):
        self.w = 0.8    #设置较大的W和较小的C避免负参数
        self.c1 = 2
        self.c2 = 2
        self.r1 = 0.6
        self.r2 = 0.3
        self.pN = pN  # 粒子数量
        self.dim = dim  # 搜索维度
        self.max_iter = max_iter  # 迭代次数
        self.train_x = train_x
        self.train_y = train_y
        self.X = np.zeros((self.pN, self.dim))  # 所有粒子的位置和速度
        self.V = np.zeros((self.pN, self.dim))
        self.pbest = np.zeros((self.pN, self.dim))  # 个体经历的最佳位置和全局最佳位置
        self.gbest = np.zeros((1, self.dim))
        self.p_fit = np.zeros(self.pN)  # 每个个体的历史最佳适应值
        self.fit = 0.6# 全局最佳适应值

# ---------------------目标函数Sphere函数-----------------------------
    def function(self, X, train_x, train_y):
        # 适应度函数为model的CV3平均值
        Model = tree.DecisionTreeRegressor(
                                        criterion='mse',
                                        splitter='best',
                                        max_depth=X[0],
                                        min_samples_split=X[1],
                                        min_samples_leaf=X[2],
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        ccp_alpha=0.0,
                                    )
        
        cv_scores = model_selection.cross_validate(Model,
                                                   x,
                                                   y,
                                                   groups=None,
                                                   scoring="r2",  #适应度函数
                                                   cv=3,
                                                   verbose=0,
                                                   fit_params=None,
                                                   pre_dispatch='2*n_jobs',
                                                   return_train_score=False,
                                                   return_estimator=False,
                                                   )
                                                       
        return cv_scores["test_score"].mean()  #3个交叉验证的均值

    
    

    # ---------------------初始化种群----------------------------------
    def init_Population(self):
        for i in range(self.pN):
            
            self.X[i][0] = random.randint(2,12)  
            self.V[i][0] = random.randint(1,3)  
            
            self.X[i][1] = random.randint(10,50)
            self.V[i][1] = random.randint(1,5)
            
            self.X[i][2] = random.randint(5,30)
            self.V[i][2] = random.randint(1,3)
            
            self.pbest[i] = self.X[i]
            tmp = self.function(self.X[i],self.train_x,self.train_y)
            self.p_fit[i] = tmp
            if tmp > self.fit:
                self.fit = tmp
                self.gbest = self.X[i]

                # ----------------------更新粒子位置----------------------------------

    def iterator(self):
        fitness = []
        for t in range(self.max_iter):
            for i in range(self.pN):  # 更新gbest\pbest
                temp = self.function(self.X[i],self.train_x,self.train_y)
                if temp > self.p_fit[i]:  # 更新个体最优
                    self.p_fit[i] = temp
                    self.pbest[i] = self.X[i]
                    if self.p_fit[i] > self.fit:  # 更新全局最优
                        self.gbest = self.X[i]
                        self.fit = self.p_fit[i]
            for i in range(self.pN):
                self.V[i] = self.w * self.V[i] + self.c1 * self.r1 * (self.pbest[i] - self.X[i]) + \
                            self.c2 * self.r2 * (self.gbest - self.X[i])
                self.X[i] = self.X[i] + self.V[i]
                
                #约束参数数据类型
                self.X[i][0] = int(self.X[i][0])
                self.X[i][1] = int(self.X[i][1])
                self.X[i][2] = int(self.X[i][2])
                
                #约束参数最小值
                if self.X[i][0] < 2:
                    self.X[i][0] = 2
                if self.X[i][1] < 2:
                    self.X[i][1] = 1
                if self.X[i][2] < 1:
                    self.X[i][2] = 1
                
            fitness.append(self.fit)
            #print(self.X[0], end=" ")
            #print(self.fit)  # 输出最优值
        return fitness,self.X[0]


def factor_select(x,y):


    my_pso = tree_PSO(pN=20, dim=3, max_iter=10, train_x=x,train_y=y)
    my_pso.init_Population()
    fitness,params = my_pso.iterator()

    Model = tree.DecisionTreeRegressor(
                        criterion='mse',
                        splitter='best',
                        max_depth=int(params[0]),
                        min_samples_split=int(params[1]),
                        min_samples_leaf=int(params[2]),
                        min_weight_fraction_leaf=0.0,
                        max_features=None,
                        random_state=None,
                        max_leaf_nodes=None,
                        min_impurity_decrease=0.0,
                        min_impurity_split=None,
                        ccp_alpha=0.0,
                    )
    Model.fit(x,y)

    fdf = pd.DataFrame()
    fdf['feature'] = xdf.columns
    fdf['importances'] = Model.feature_importances_
    fdf = fdf.query('importances>0')
    fdf = fdf.sort_values(by='importances',ascending=False)
    fdf.index = range(len(fdf))
    fdf = fdf[:10]
    factor = list(fdf['feature'])
    return fdf

# RF_PSO
    随机森林调参

In [160]:
# ----------------------PSO参数设置---------------------------------
class RF_PSO():
    def __init__(self, pN, dim, max_iter, train_x, train_y):
        self.w = 0.8    #设置较大的W和较小的C避免负参数
        self.c1 = 2
        self.c2 = 2
        self.r1 = 0.6
        self.r2 = 0.3
        self.pN = pN  # 粒子数量
        self.dim = dim  # 搜索维度
        self.max_iter = max_iter  # 迭代次数
        self.train_x = train_x
        self.train_y = train_y
        self.X = np.zeros((self.pN, self.dim))  # 所有粒子的位置和速度
        self.V = np.zeros((self.pN, self.dim))
        self.pbest = np.zeros((self.pN, self.dim))  # 个体经历的最佳位置和全局最佳位置
        self.gbest = np.zeros((1, self.dim))
        self.p_fit = np.zeros(self.pN)  # 每个个体的历史最佳适应值
        self.fit = 0.6# 全局最佳适应值

    # ---------------------目标函数Sphere函数-----------------------------
    def function(self, X, train_x, train_y):
        # 适应度函数为model的CV3平均值
        Model = ensemble.RandomForestRegressor(
                                    n_estimators=X[0],
                                    criterion='mse',
                                    max_depth=X[1],
                                    min_samples_split=X[2],
                                    min_samples_leaf=X[3],
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto',
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=None,
                                    random_state=None,
                                    verbose=0,
                                    warm_start=False,
                                    ccp_alpha=0.0,
                                    max_samples=None,
                                )
        cv_scores = model_selection.cross_validate(Model,
                                                   x,
                                                   y,
                                                   groups=None,
                                                   scoring="r2",  #适应度函数
                                                   cv=3,
                                                   verbose=0,
                                                   fit_params=None,
                                                   pre_dispatch='2*n_jobs',
                                                   return_train_score=False,
                                                   return_estimator=False,
                                                   )
                                                       
        return cv_scores["test_score"].mean()  #5个交叉验证的均值

    
    

    # ---------------------初始化种群----------------------------------
    def init_Population(self):
        for i in range(self.pN):
            
            self.X[i][0] = random.randint(50,150)  
            self.V[i][0] = random.randint(10,20)  
            
            self.X[i][1] = random.randint(1,10)
            self.V[i][1] = random.randint(1,3)
            
            self.X[i][2] = random.randint(5,50)
            self.V[i][2] = random.randint(1,5)
            
            self.X[i][3] = random.randint(5,50)
            self.V[i][3] = random.randint(1,5)
            
            self.pbest[i] = self.X[i]
            tmp = self.function(self.X[i],self.train_x,self.train_y)
            self.p_fit[i] = tmp
            if tmp > self.fit:
                self.fit = tmp
                self.gbest = self.X[i]

                # ----------------------更新粒子位置----------------------------------

    def iterator(self):
        fitness = []
        for t in range(self.max_iter):
            for i in range(self.pN):  # 更新gbest\pbest
                temp = self.function(self.X[i],self.train_x,self.train_y)
                if temp > self.p_fit[i]:  # 更新个体最优
                    self.p_fit[i] = temp
                    self.pbest[i] = self.X[i]
                    if self.p_fit[i] > self.fit:  # 更新全局最优
                        self.gbest = self.X[i]
                        self.fit = self.p_fit[i]
            for i in range(self.pN):
                self.V[i] = self.w * self.V[i] + self.c1 * self.r1 * (self.pbest[i] - self.X[i]) + \
                            self.c2 * self.r2 * (self.gbest - self.X[i])
                self.X[i] = self.X[i] + self.V[i]
                
                #约束参数数据类型
                self.X[i][0] = int(self.X[i][0])
                self.X[i][1] = int(self.X[i][1])
                self.X[i][2] = int(self.X[i][2])
                self.X[i][3] = int(self.X[i][3])
                
                #约束参数最小值
                if self.X[i][0] < 50:
                    self.X[i][0] = 50
                if self.X[i][1] < 1:
                    self.X[i][1] = 1
                if self.X[i][2] < 2:
                    self.X[i][2] = 2
                if self.X[i][3] < 1:
                    self.X[i][3] = 1
                
            fitness.append(self.fit)
            #print(self.X[0], end=" ")
            #print(self.fit)  # 输出最优值
        return fitness,self.X[0]
    

def RF_fun(x,y,predict):
    my_pso = RF_PSO(pN=20, dim=4, max_iter=10, train_x=x,train_y=y)
    my_pso.init_Population()
    fitness,params = my_pso.iterator()

    Model = ensemble.RandomForestRegressor(
                    n_estimators=int(params[0]),
                    criterion='mse',
                    max_depth=int(params[1]),
                    min_samples_split=int(params[2]),
                    min_samples_leaf=int(params[3]),
                    min_weight_fraction_leaf=0.0,
                    max_features='auto',
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.0,
                    min_impurity_split=None,
                    bootstrap=True,
                    oob_score=False,
                    n_jobs=None,
                    random_state=None,
                    verbose=0,
                    warm_start=False,
                    ccp_alpha=0.0,
                    max_samples=None,
                )
    Model.fit(x,y)

    predict_return = Model.predict(predict.values)
    return predict_return

# ADA_PSO
    用于Adaboost调参

In [161]:
# ----------------------PSO参数设置---------------------------------
class ADA_PSO():
    def __init__(self, pN, dim, max_iter, train_x, train_y):
        self.w = 0.8    #设置较大的W和较小的C避免负参数
        self.c1 = 2
        self.c2 = 2
        self.r1 = 0.6
        self.r2 = 0.3
        self.pN = pN  # 粒子数量
        self.dim = dim  # 搜索维度
        self.max_iter = max_iter  # 迭代次数
        self.train_x = train_x
        self.train_y = train_y
        self.X = np.zeros((self.pN, self.dim))  # 所有粒子的位置和速度
        self.V = np.zeros((self.pN, self.dim))
        self.pbest = np.zeros((self.pN, self.dim))  # 个体经历的最佳位置和全局最佳位置
        self.gbest = np.zeros((1, self.dim))
        self.p_fit = np.zeros(self.pN)  # 每个个体的历史最佳适应值
        self.fit = 0.6# 全局最佳适应值

    # ---------------------目标函数Sphere函数-----------------------------
    def function(self, X, train_x, train_y):
        # 适应度函数为model的CV3平均值
        Model = ensemble.AdaBoostRegressor(
                            base_estimator=None,
                            n_estimators=X[0],
                            learning_rate=X[1],
                            loss='linear',
                            random_state=None,
                        )
        cv_scores = model_selection.cross_validate(Model,
                                                   x,
                                                   y,
                                                   groups=None,
                                                   scoring="r2",  #适应度函数
                                                   cv=3,
                                                   verbose=0,
                                                   fit_params=None,
                                                   pre_dispatch='2*n_jobs',
                                                   return_train_score=False,
                                                   return_estimator=False,
                                                   )
                                                       
        return cv_scores["test_score"].mean()  #5个交叉验证的均值

    
    

    # ---------------------初始化种群----------------------------------
    def init_Population(self):
        for i in range(self.pN):
            
            self.X[i][0] = random.randint(50,150)  
            self.V[i][0] = random.randint(10,20)  
            
            self.X[i][1] = random.uniform(0.1,1.2)
            self.V[i][1] = random.uniform(0.1,0.5)
            
            self.pbest[i] = self.X[i]
            tmp = self.function(self.X[i],self.train_x,self.train_y)
            self.p_fit[i] = tmp
            if tmp > self.fit:
                self.fit = tmp
                self.gbest = self.X[i]

                # ----------------------更新粒子位置----------------------------------

    def iterator(self):
        fitness = []
        for t in range(self.max_iter):
            for i in range(self.pN):  # 更新gbest\pbest
                temp = self.function(self.X[i],self.train_x,self.train_y)
                if temp > self.p_fit[i]:  # 更新个体最优
                    self.p_fit[i] = temp
                    self.pbest[i] = self.X[i]
                    if self.p_fit[i] > self.fit:  # 更新全局最优
                        self.gbest = self.X[i]
                        self.fit = self.p_fit[i]
            for i in range(self.pN):
                self.V[i] = self.w * self.V[i] + self.c1 * self.r1 * (self.pbest[i] - self.X[i]) + \
                            self.c2 * self.r2 * (self.gbest - self.X[i])
                self.X[i] = self.X[i] + self.V[i]
                
                #约束参数数据类型
                self.X[i][0] = int(self.X[i][0])
                
                #约束参数最小值
                if self.X[i][0] < 50:
                    self.X[i][0] = 50
                if self.X[i][1] < 0:
                    self.X[i][1] = 0.1
                
            fitness.append(self.fit)
            #print(self.X[0], end=" ")
            #print(self.fit)  # 输出最优值
        return fitness,self.X[0]
    

def ADA_fun(x,y,predict):
    my_pso = ADA_PSO(pN=20, dim=2, max_iter=10, train_x=x,train_y=y)
    my_pso.init_Population()
    fitness,params = my_pso.iterator()

    Model = ensemble.AdaBoostRegressor(
                            base_estimator=None,
                            n_estimators=int(params[0]),
                            learning_rate=params[1],
                            loss='linear',
                            random_state=None,
                        )
    Model.fit(x,y)

    predict_return = Model.predict(predict.values)
    return predict_return

# GBDT_PSO
    用于GBDT调参

In [162]:
# ----------------------PSO参数设置---------------------------------
class GBDT_PSO():
    def __init__(self, pN, dim, max_iter, train_x, train_y):
        self.w = 0.8    #设置较大的W和较小的C避免负参数
        self.c1 = 2
        self.c2 = 2
        self.r1 = 0.6
        self.r2 = 0.3
        self.pN = pN  # 粒子数量
        self.dim = dim  # 搜索维度
        self.max_iter = max_iter  # 迭代次数
        self.train_x = train_x
        self.train_y = train_y
        self.X = np.zeros((self.pN, self.dim))  # 所有粒子的位置和速度
        self.V = np.zeros((self.pN, self.dim))
        self.pbest = np.zeros((self.pN, self.dim))  # 个体经历的最佳位置和全局最佳位置
        self.gbest = np.zeros((1, self.dim))
        self.p_fit = np.zeros(self.pN)  # 每个个体的历史最佳适应值
        self.fit = 0.6# 全局最佳适应值

    # ---------------------目标函数Sphere函数-----------------------------
    def function(self, X, train_x, train_y):
        # 适应度函数为model的CV3平均值
        Model = ensemble.GradientBoostingRegressor(
                                            loss='ls',
                                            learning_rate=X[0],
                                            n_estimators=100,
                                            subsample=1.0,
                                            criterion='friedman_mse',
                                            min_samples_split=X[1],
                                            min_samples_leaf=X[2],
                                            min_weight_fraction_leaf=0.0,
                                            max_depth=X[3],
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            init=None,
                                            random_state=None,
                                            max_features=None,
                                            alpha=0.9,
                                            verbose=0,
                                            max_leaf_nodes=None,
                                            warm_start=False,
                                            validation_fraction=0.1,
                                            n_iter_no_change=None,
                                            tol=0.0001,
                                            ccp_alpha=0.0,
                                        )
        cv_scores = model_selection.cross_validate(Model,
                                                   x,
                                                   y,
                                                   groups=None,
                                                   scoring="r2",  #适应度函数
                                                   cv=3,
                                                   verbose=0,
                                                   fit_params=None,
                                                   pre_dispatch='2*n_jobs',
                                                   return_train_score=False,
                                                   return_estimator=False,
                                                   )
                                                       
        return cv_scores["test_score"].mean()  #5个交叉验证的均值

    
    

    # ---------------------初始化种群----------------------------------
    def init_Population(self):
        for i in range(self.pN):
            
            self.X[i][0] = random.uniform(0.1,1.2)  
            self.V[i][0] = random.uniform(0.1,0.5)
            
            self.X[i][1] = random.randint(5,50)
            self.V[i][1] = random.randint(1,5)
            
            self.X[i][2] = random.randint(5,50)
            self.V[i][2] = random.randint(1,5)
            
            self.X[i][3] = random.randint(1,10)
            self.V[i][3] = random.randint(1,3)
            
            self.pbest[i] = self.X[i]
            tmp = self.function(self.X[i],self.train_x,self.train_y)
            self.p_fit[i] = tmp
            if tmp > self.fit:
                self.fit = tmp
                self.gbest = self.X[i]

                # ----------------------更新粒子位置----------------------------------

    def iterator(self):
        fitness = []
        for t in range(self.max_iter):
            for i in range(self.pN):  # 更新gbest\pbest
                temp = self.function(self.X[i],self.train_x,self.train_y)
                if temp > self.p_fit[i]:  # 更新个体最优
                    self.p_fit[i] = temp
                    self.pbest[i] = self.X[i]
                    if self.p_fit[i] > self.fit:  # 更新全局最优
                        self.gbest = self.X[i]
                        self.fit = self.p_fit[i]
            for i in range(self.pN):
                self.V[i] = self.w * self.V[i] + self.c1 * self.r1 * (self.pbest[i] - self.X[i]) + \
                            self.c2 * self.r2 * (self.gbest - self.X[i])
                self.X[i] = self.X[i] + self.V[i]
                
                #约束参数数据类型
                
                self.X[i][1] = int(self.X[i][1])
                self.X[i][2] = int(self.X[i][2])
                self.X[i][3] = int(self.X[i][3])
                
                #约束参数最小值
                if self.X[i][0] < 0:
                    self.X[i][0] = 0.1
                if self.X[i][1] < 2:
                    self.X[i][1] = 2
                if self.X[i][2] < 2:
                    self.X[i][2] = 2
                if self.X[i][3] < 1:
                    self.X[i][3] = 1
                
            fitness.append(self.fit)
            #print(self.X[0], end=" ")
            #print(self.fit)  # 输出最优值
        return fitness,self.X[0]
    

def GBDT_fun(x,y,predict):
    my_pso = GBDT_PSO(pN=20, dim=4, max_iter=10, train_x=x,train_y=y)
    my_pso.init_Population()
    fitness,params = my_pso.iterator()

    Model = ensemble.GradientBoostingRegressor(
                                loss='ls',
                                learning_rate=params[0],
                                n_estimators=100,
                                subsample=1.0,
                                criterion='friedman_mse',
                                min_samples_split=int(params[1]),
                                min_samples_leaf=int(params[2]),
                                min_weight_fraction_leaf=0.0,
                                max_depth=int(params[3]),
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                init=None,
                                random_state=None,
                                max_features=None,
                                alpha=0.9,
                                verbose=0,
                                max_leaf_nodes=None,
                                warm_start=False,
                                validation_fraction=0.1,
                                n_iter_no_change=None,
                                tol=0.0001,
                                ccp_alpha=0.0,
                            )
    Model.fit(x,y)

    predict_return = Model.predict(predict.values)
    return predict_return

In [1]:
file = '/Users/cr/Downloads/undergraduate time/大四上/FIN 4998/paper/code/机器学习/'
import pickle
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
import random
import time
from sklearn import preprocessing,tree,ensemble,model_selection,metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
tdic = pickle.load(open(file+'trade_dic','rb'))
df = pickle.load(open(file+'ml_data','rb'))

In [6]:
#数据的汇总统计描述
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
return,398240.0,0.011365,1.565157e-01,-8.635171e-01,-0.067831,0.000000,0.069347,6.399758e+00
peTTM,402643.0,96.135719,1.249271e+04,-7.694016e+05,16.921905,34.652096,66.792843,2.801318e+06
pbMRQ,402638.0,5.967251,3.108408e+02,-1.021808e+05,1.885265,3.013166,5.038959,2.178802e+04
pcfNcfTTM,402604.0,-452.140136,1.470764e+05,-3.065053e+07,-37.580041,6.663400,45.683559,3.492290e+07
psTTM,401967.0,15.317529,7.623575e+02,-2.797948e+04,1.609981,3.360850,7.022415,1.371679e+05
...,...,...,...,...,...,...,...,...
q_op_qoq_TTM,191209.0,106.380815,6.750882e+03,-4.045831e+05,0.755875,22.414275,86.403900,4.190297e+05
q_profit_yoy_TTM,181698.0,-13045.824431,1.594203e+06,-1.961631e+08,-26.501675,16.126725,71.120087,3.645647e+05
q_profit_qoq_TTM,191244.0,193.095371,1.364364e+04,-6.849547e+05,2.226937,23.479350,87.733812,1.129425e+06
q_netprofit_yoy_TTM,181704.0,33.109702,5.348248e+03,-2.571226e+05,-26.350563,16.247050,70.956625,3.597948e+05


# 按时间截面标准化数据

In [7]:
daterange = [x[1] for x in df.index.tolist()]
daterange = list(set(daterange))
daterange.sort()

In [15]:
sdf = pd.DataFrame()
for d in tqdm(daterange):
    temp = df.loc[(slice(None),d),:]
    temp.iloc[:,1:] = preprocessing.StandardScaler().fit_transform(temp.iloc[:,1:].values)
    
    sdf = pd.concat([sdf,temp])

100%|██████████| 133/133 [00:24<00:00,  5.36it/s]


In [18]:
del df
gc.collect()

1539

In [174]:
fdf_dic = {}
sec_dic = {}
for day in tqdm(daterange[12:]):
    
    i = daterange.index(day)
    traindate = daterange[i-12:i]
    temp = sdf.loc[(slice(None),traindate),:]

    temp = temp.dropna(subset=['return'],axis=0)

    drop_ = pd.DataFrame(temp.describe().T['count'] / len(temp) < 0.95)   #删除缺失大于5%的
    temp = temp.drop(labels=list(drop_.query('count==True').index),axis=1)    
    temp = temp.dropna(how='any',axis=0) #删除缺失样本

    y = temp['return'].values.reshape(-1,1)
    xdf = temp.drop(labels=['return'],axis=1)
    x = xdf.values

    fdf = factor_select(x,y)


    pdf = sdf.loc[(slice(None),day),list(fdf['feature'])] #取排名前10的特征
    drop_ = pd.DataFrame(pdf.describe().T['count'] / len(pdf) < 0.2)    #删除缺失过多的特征
    pdf = pdf.drop(labels=list(drop_.query('count==True').index),axis=1)
    pdf = pdf.dropna(how='any',axis=0)  #删除缺失数据
    
    xdf = xdf[list(pdf.columns)]  #
    x = xdf.values

    rf_predict = RF_fun(x,y,pdf)
    ada_predict = ADA_fun(x,y,pdf)
    gbdt_predict = GBDT_fun(x,y,pdf)

    pdf['RF'],pdf['ADA'],pdf['GBDT'] = rf_predict,ada_predict,gbdt_predict
    pdf['predict'] = pdf['RF'] + pdf['ADA'] + pdf['GBDT']
    pdf = pdf.sort_values(by='predict',ascending=False)
    sort_sec = [x[0] for x in pdf.index.tolist()]
    
    fdf_dic[day] = fdf
    sec_dic[day] = sort_sec

100%|██████████| 121/121 [46:37<00:00, 23.12s/it]


In [179]:
import pickle
pickle.dump(fdf_dic,open(file+'fea_imp','wb'))
pickle.dump(sec_dic,open(file+'sec','wb'))

In [None]:
###sec：
###key：时间截面
###时间截面对应的股票收益率列表

In [2]:
import pickle
fea_imp = pickle.load(open(file+'fea_imp','rb'))

In [6]:
##fea_imp
imp_file = file+'imp/'
sum_fea = pd.DataFrame()
for k in fea_imp.keys():
    fea_imp[k].to_excel(imp_file+k+'.xls')
    sum_fea = pd.concat([sum_fea,fea_imp[k]])

In [9]:
ssf = pd.DataFrame(sum_fea['importances'].groupby(sum_fea['feature']).sum())
ssf = ssf.sort_values(by='importances',ascending=False)

In [12]:
print('使用到的特征的总数：',ssf.shape[0])

使用到的特征的总数： 115


In [14]:
ssf.to_excel(file+'特征重要度汇总.xls')

In [16]:
ssf[:20]

Unnamed: 0_level_0,importances
feature,Unnamed: 1_level_1
pbMRQ,18.355171
q_ocf_to_sales,11.048004
psTTM,9.978842
inv_turn,8.008618
pcfNcfTTM,7.2557
ar_turn,6.901118
peTTM,6.184212
skew20,4.667628
rev20,2.793402
ocf_yoy,2.219009
