In [165]:
import pandas as pd
import numpy as np
from PyFin.api import *
import pickle
import itertools
import sys
sys.path.append('../..')
from ultron.factor.genetic.genetic_factors import GeneticFactors
from ultron.factor.combine.combine_engine import CombineEngine

In [166]:
def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s

def tanh(x):
    s1 = np.exp(x) - np.exp(-x)
    s2 = np.exp(x) + np.exp(-x)
    s = s1 / s2
    return s

def relu(x):
    s = np.where(x < 0, 0, x)
    return s

In [167]:
##构造5个算子
accumulators = {2:np.log,3:np.sqrt,4:np.fabs,5:tanh, 6:sigmoid}

In [168]:
with open('factor_data.pkl','rb') as file2:
    total_data = pickle.load(file2)

In [169]:
factor_columns = [i for i in list(set(total_data.columns)) if i not in ['trade_date','code','ret']]

In [170]:
point = int(np.random.uniform(0, len(factor_columns))/2)
ori_field = factor_columns[:point]
add_field = factor_columns[point:]
evalue_cols = np.array(ori_field + add_field)

In [171]:
_del_prob = 0.2
_add_prob = 0.2
_change_prob = 0.9

In [172]:
def ga_generate_ori(ori_field, add_field):
    ori_list = (np.random.uniform(0, 1, (1, len(ori_field))) < _del_prob).tolist()[0]
    add_list = (np.random.uniform(0, 1, (1, len(add_field))) < _add_prob).tolist()[0]
    new_list = ori_list + add_list
    new_list = [1 if i> 0 else 0 for i in new_list]
    return np.array(new_list)

In [173]:
ori_group = {i:ga_generate_ori(ori_field, add_field) for i in range(5)}

In [174]:
new_dict = ori_group.copy()
score = 1.0 / len(ori_group)
dict_score = {k:score for k in ori_group.keys()}
g, p = np.array([[k, v] for k, v in dict_score.items()]).T
flag = max(ori_group.keys())

In [175]:
cross_group = np.random.choice(g, size = int(len(g)/2), p = p, replace= False)
for (fa,mo) in itertools.combinations(cross_group, 2):
    flag += 1
    fa_code, mo_code = ori_group[fa], ori_group[mo]
    cut_point = np.random.randint(1, len(fa_code)-1)
    fa_code0, fa_code1 = fa_code[:cut_point], fa_code[cut_point:]
    mo_code0, mo_code1 = mo_code[:cut_point], mo_code[cut_point:]
    new1 = np.hstack([fa_code0, mo_code1])
    prob = np.random.uniform(0, 1)
    print(prob,_change_prob)
    if prob < _change_prob:
        change_point = np.random.randint(0, len(fa_code))
        ## 改变该点的值
        new1[change_point] = not new1[change_point] 
        new1 = [int(np.random.uniform(0, len(accumulators) + 2)) if i> 0 else 0 for i in new1] #所有的显性特征进行随机变异
    new_dict[flag] = np.array(new1)

0.16653514598973573 0.9


In [176]:
tp = [True if i> 0 else False for i in new_dict[5]] 
evalue_cols[tp]

array(['alpha_151', 'alpha_108', 'alpha_137', 'alpha_23', 'alpha_167',
       'alpha_181', 'alpha_54', 'alpha_107', 'alpha_28', 'alpha_123',
       'alpha_126', 'alpha_34', 'alpha_152', 'alpha_66', 'alpha_124',
       'alpha_78', 'alpha_180', 'alpha_14', 'alpha_58', 'alpha_79',
       'alpha_80', 'alpha_187', 'alpha_13', 'alpha_104', 'alpha_117',
       'alpha_147', 'alpha_27', 'alpha_98', 'alpha_125', 'alpha_67',
       'alpha_110', 'alpha_166', 'alpha_179', 'alpha_19', 'alpha_100',
       'alpha_12', 'alpha_143', 'alpha_111'], dtype='<U21')

In [177]:
def calc_evalue_group(t_group, evalue_cols, total_data):
    index = 0
    res = {}
    cols = []
    for i in t_group:
        if i > 0:
            factor_name = evalue_cols[index]
            factor_data = total_data[factor_name].copy()
            if i > 1:
                func = accumulators[i]
                factor_data = func(factor_data)
            res[factor_name + 'c_' + str(i)] = factor_data.fillna(0).values
            cols.append(factor_name + 'c_' + str(i))
        index += 1
    #factor_data = pd.DataFrame(res)
    #cols = factor_data.columns
    return cols, res
    ## 个体评分
    #factor_data[cols].mean(axis=1).values

In [178]:
def calc_ic(factor_df, return_df, factor_list, return_col_name='target_return', ic_type='spearman'):
    """
    计算因子IC值, 本月和下月因子值的秩相关
    params:
            factor_df: DataFrame, columns=['ticker', 'tradeDate', factor_list]
            return_df: DataFrame, colunms=['ticker, 'tradeDate'， return_col_name], 预先计算好的未来的收益率
            factor_list:　list， 需要计算IC的因子名list
            return_col_name: str, return_df中的收益率列名
            method: : {'spearman', 'pearson'}, 默认'spearman', 指定计算rank IC('spearman')或者Normal IC('pearson')
    return:
            DataFrame, 返回各因子的IC序列， 列为: ['tradeDate', factor_list]
    """
    merge_df = factor_df.merge(return_df, on=['code', 'trade_date'])
    # 遍历每个因子，计算对应的IC
    factor_ic_list = []
    for factor_name in factor_list:
        tmp_factor_ic = merge_df.groupby(['trade_date']).apply(
            lambda x: x[[factor_name, return_col_name]].corr(method=ic_type).values[0, 1])
        tmp_factor_ic.name = factor_name
        factor_ic_list.append(tmp_factor_ic)
    factor_ic_frame = pd.concat(factor_ic_list, axis=1)
    factor_ic_frame.reset_index(inplace=True)
    return factor_ic_frame

In [179]:
def equal_combine(factor_df, factor_list, ):
    """
    等权法合成因子
    参数:
        factor_df: DataFrame, 待合成因子值
        factor_list: list, 待合成因子列表
    返回:
        DataFrame, 复合因子
    """
    factor_df = factor_df.copy()
    factor_df['conmbine'] = factor_df[factor_list].mean(axis=1).values
    return np.corrcoef(factor_df['conmbine'].fillna(0).values, factor_df['ret'].fillna(0).values)[0,1]

In [180]:
tres = {} # # 新种群有变异后的特征也有父类特征
score_dict = {}
cols_dict = {}
for g, code in new_dict.items():
    cols, res = calc_evalue_group(new_dict[g], evalue_cols, total_data)
    ##特征评分
    score = equal_combine(pd.concat([pd.DataFrame(res),total_data[['code','trade_date','ret']]],axis=1), cols, )
    score_dict[g] = score
    cols_dict[g] = cols
    tres = dict(tres, **res)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  
  import sys


In [108]:
index = 0
res = {}
cols = []
for i in new_dict[5]:
    if i > 0:
        factor_name = evalue_cols[index]
        factor_data = total_data[factor_name].copy()
        if i > 1:
            func = accumulators[i]
            factor_data = func(factor_data)
        res[factor_name + 'c_' + str(i)] = factor_data.fillna(0).values
        cols.append(factor_name + 'c_' + str(i))
    index += 1

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  
  # Remove the CWD from sys.path while we load stuff.


In [186]:
new_list = [True if i>0 else False for i in new_dict[5]]
sub_data = total_data[evalue_cols[new_list]].T

In [187]:
s = new_dict[5][new_dict[5] > 0]
s

array([1, 6, 4, 4, 6, 1, 6, 3, 2, 2, 6, 4, 4, 1, 4, 5, 2, 1, 3, 1, 4, 2,
       6, 6, 5, 6, 5, 6, 3, 2, 5, 3, 4, 3, 6, 3, 5, 2])

In [188]:
t = evalue_cols[new_list]
t

array(['alpha_151', 'alpha_108', 'alpha_137', 'alpha_23', 'alpha_167',
       'alpha_181', 'alpha_54', 'alpha_107', 'alpha_28', 'alpha_123',
       'alpha_126', 'alpha_34', 'alpha_152', 'alpha_66', 'alpha_124',
       'alpha_78', 'alpha_180', 'alpha_14', 'alpha_58', 'alpha_79',
       'alpha_80', 'alpha_187', 'alpha_13', 'alpha_104', 'alpha_117',
       'alpha_147', 'alpha_27', 'alpha_98', 'alpha_125', 'alpha_67',
       'alpha_110', 'alpha_166', 'alpha_179', 'alpha_19', 'alpha_100',
       'alpha_12', 'alpha_143', 'alpha_111'], dtype='<U21')

In [190]:
x = pd.DataFrame(np.vstack((s, t))).T
x = x.rename(columns={0:'acc',1:'factor'}).set_index('factor')
x

Unnamed: 0_level_0,acc
factor,Unnamed: 1_level_1
alpha_151,1
alpha_108,6
alpha_137,4
alpha_23,4
alpha_167,6
alpha_181,1
alpha_54,6
alpha_107,3
alpha_28,2
alpha_123,2


In [199]:
%%time
res = []
for index, row in sub_data.iterrows():
    acc_value = x.loc[index].values[0]
    if int(acc_value) > 1:
        new_row = accumulators[acc_value](row)

KeyError: '6'

In [200]:
accumulators[6]

<function __main__.sigmoid(x)>