In [1]:
import pandas as pd
import numpy as np
import pickle,itertools,sys,pdb
from alphamind.data.processing import factor_processing
from alphamind.data.standardize import standardize
from alphamind.data.winsorize import winsorize_normal
from ultron.factor.combine.combine_engine import CombineEngine
from ultron.factor.genetic.mutation_factors import GeneticMutationFactors
from ultron.factor.genetic.accumulators import transform
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 以等权合成因子的IC值作为分数判断 (注意:系统是以分数倒序排序来进行种群筛选，若类似IC带有方向性需转化为绝对值)
def equal_combine(factor_df, factor_list):
    factor_df = factor_df.copy()
    ndiff_field = [i for i in list(set(factor_df.columns)) if i not in factor_list]
    #合成前数据预处理
    alpha_res = []
    grouped = factor_df.groupby(['trade_date'])
    for k, g in grouped:
        ret_preprocess = factor_processing(g[factor_list].fillna(0).values,
                                       pre_process=[winsorize_normal, standardize])
        f = pd.DataFrame(ret_preprocess, columns=factor_list)
        for k in ndiff_field:
            f[k] = g[k].values
        alpha_res.append(f)
    total_data = pd.concat(alpha_res)
    total_data = factor_df
    total_data['conmbine'] = total_data[factor_list].mean(axis=1).values
    score = np.corrcoef(total_data['conmbine'].fillna(0).values, total_data['ret'].fillna(0).values)[0,1]
    return abs(score)

In [3]:
# 读取数据
with open('factor_data.pkl','rb') as file2:
    total_data = pickle.load(file2)
total_data.head(10)

Unnamed: 0,trade_date,code,ROEAfterNonRecurring,CHV,DROE,IVR,EPAfterNonRecurring,DROEAfterNonRecurring,CFinc1,alpha_1,...,alpha_184,alpha_185,alpha_186,alpha_187,alpha_188,alpha_189,alpha_190,alpha_191,ret,negMarketValue
0,2018-09-18,10,-105.734422,-0.660606,-202.841612,0.272982,-0.283813,-5.04279,-0.715046,-0.275493,...,1.833226,0.993045,47.56062,1.86,-45.363736,0.057778,-0.580667,-0.609058,0.010215,2542974000.0
1,2018-09-18,852,-1.402939,-0.733333,-123.396084,0.016266,-0.003703,58.969788,-0.715046,0.171365,...,1.618495,0.188641,75.607325,8.31,19.319093,0.335556,-1.094852,-1.018469,0.072496,6364398000.0
2,2018-09-18,2019,19.556671,0.273557,-18.1705,0.567898,0.097143,-4.336328,-0.715046,0.587883,...,0.450731,0.237612,59.255802,1.988267,-13.381817,0.348163,0.400129,-1.12534,0.033667,9262848000.0
3,2018-09-18,2278,0.313534,-0.866667,-77.894469,0.214677,0.001169,119.794377,-0.715046,-0.842086,...,1.202937,0.10142,51.405364,3.02,-21.761117,0.109167,0.43172,0.137349,-0.017744,2738049000.0
4,2018-09-18,2380,4.43459,-0.820672,-2.934867,0.540656,0.031446,-1.762149,-0.715046,-0.66543,...,1.039419,0.63228,20.042926,1.27,12.386432,0.072222,-0.409236,0.745425,0.020334,1744613000.0
5,2018-09-18,2468,20.605487,-0.915152,-0.400065,0.606586,0.05993,2.53971,0.346927,-0.013798,...,0.640416,0.208345,16.08162,5.2,10.875756,0.216944,-0.721486,-0.080343,0.119134,5454397000.0
6,2018-09-18,2624,17.53901,-0.515152,4.299128,0.321421,0.046763,3.378517,0.484389,-0.317983,...,0.177497,0.337294,21.211366,8.87,-4.847316,0.270833,-0.372736,0.686561,0.023912,11372010000.0
7,2018-12-03,300674,,-0.69697,,,0.012046,,-0.60276,0.53931,...,0.981496,0.029121,,,5.805438,1.696944,,,-0.221687,1509177000.0
8,2018-09-18,2850,1.569706,-0.522799,-64.921336,0.340336,0.008644,-61.21766,-0.715046,-0.191557,...,0.915306,0.452043,40.196448,2.56,-26.432105,0.376667,0.397108,-0.889158,0.064603,1795720000.0
9,2018-09-18,300004,-8.701008,-0.878788,-193.8755,0.229192,-0.136293,6.550788,-0.715046,-0.259448,...,0.548041,0.090409,33.340708,1.87,3.492951,0.096667,-0.427337,-0.962769,0.227645,1778908000.0


In [4]:
diff_filed = ['trade_date','code','ret'] # 非因子列
factor_columns = [i for i in list(set(total_data.columns)) if i not in ['trade_date','code','ret']] # 因子列

In [5]:
# 将原始特征进行分类，分位两类，一类较强类，一类较弱类。可通过前期处理比如IR分数分类等方法，此处采用随机分类
point = int(np.random.uniform(0, len(factor_columns))/2)
ori_field = factor_columns[:point]
add_field = factor_columns[point:]

In [6]:
# 定义遗传对象
mutation_factors = GeneticMutationFactors(0.2, #添加第一类特征概率
                                          0.2, #添加第二类特征概率
                                          0.9, #突变概率
                                          0.0000001,#收敛值大小，即子代最好种群和父代最好种群分数差值，若小于改值则停止繁衍
                                          generation=6, # 繁衍代数
                                          group_num=6, # 每代种群数
                                          objective=equal_combine)

In [7]:
#第一种返回最后一代最好前group_num种群
field_group = mutation_factors.genetic_run(total_data, diff_filed = diff_filed, strong_field = ori_field, 
                             weak_field = add_field, is_best=False)

种群数9, 种群均分:0.012954
繁衍代数:1,最好分数:0.046000,1和0最好组分数差值0.046000,1代最好最差种群分数差值0.394569
种群数9, 种群均分:0.018065
繁衍代数:2,最好分数:0.046000,2和1最好组分数差值0.000000,2代最好最差种群分数差值0.282933
种群数9, 种群均分:0.019936
繁衍代数:3,最好分数:0.046000,3和2最好组分数差值0.000000,3代最好最差种群分数差值0.233396
种群数9, 种群均分:0.020451
繁衍代数:4,最好分数:0.046000,4和3最好组分数差值0.000000,4代最好最差种群分数差值0.216192
种群数9, 种群均分:0.022442
繁衍代数:5,最好分数:0.046000,5和4最好组分数差值0.000000,5代最好最差种群分数差值0.174157
种群数9, 种群均分:0.023185
繁衍代数:6,最好分数:0.046000,6和5最好组分数差值0.000000,6代最好最差种群分数差值0.162474


In [53]:
#展示种群繁衍过程，并将因子变异过程转化为算子，用于样本外数据直接计算得到新因子
# transform(i,is_formula)# is_formula为False返回公式字符串，is_formula为True返回可执行公式
formula_group = {}
for k, g in field_group.items():
    formula_group[k] = [transform(i,is_formula=False) for i in g] 

In [54]:
#第18号种群基因特征及因子特征
formula_group[18]

["SecurityLatestValueHolder(CSRankedSecurityValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder('alpha_40'))))))",
 "SecurityLatestValueHolder(SecurityFloorValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder('alpha_80'))))))",
 "SecurityLatestValueHolder(CSPercentileSecurityValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder('alpha_67'))))))",
 "SecurityLatestValueHolder(SecurityDiffValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder('alpha_4'))))))",
 "SecurityLatestValueHolder(SecurityRoundValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder('alpha_161'))))))",
 "SecurityLatestValueHolder(CSPercentileSecurityValueHolder(SecurityLatestValueHolder(SecurityLatestValueHolder