# GPLearnFinance3D 功能展示

必要说明:
1. 这个库还支持训练集和验证集的划分。
2. 支持通过表达式筛选符合要求的因子（该功能是在基础的 baseIC 之上开发的，目的是让不懂代码的人也能直接用字符串灵活地设计筛选指标）[PS: 所有指标最后都会取交集，也就是说，如果你要求训练集 IC>0.02、验证集 IC>0.01，那么只会返回同时满足这两个条件的结果]
3. 新功能支持由 GPLearn 自由选择算子的参数，比如使用 std_8 还是 std_5，都由算法自己通过遗传进化得出。

In [1]:
### Imports and notebook-wide settings
import numpy as np
import pandas as pd
import genetic
from IPython.core.interactiveshell import InteractiveShell
import warnings
from scipy import stats

# Fix NumPy's global random seed.
np.random.seed(10)
# Pandas display options: show every column, let wide frames wrap across
# lines, and treat ambiguous-width Unicode characters as wide when aligning.
pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', True)
pd.set_option('display.unicode.ambiguous_as_wide', True)
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"
import random

# Project-local modules (their contents are not visible in this notebook).
from add_ts_function import dynamic_ts_std, dynamic_ts_mean,dynamic_ts_max
from functions import _function_map
from add_ts_function import _extra_function_map
import os

In [2]:
def make_XY(df, index_name, columns_name, Y_column1):
    """Pivot a long table into a 3-D feature array and a 2-D target array.

    ``df`` is pivoted with ``index_name`` as the row axis and ``columns_name``
    as the inner column level, so every other column becomes a group of
    ``(column, columns_name value)`` columns. Combinations that do not occur
    in ``df`` are kept as NaN (``dropna=False``).

    Args:
        df: Long-format DataFrame containing ``index_name``, ``columns_name``,
            ``Y_column1`` and the feature columns.
        index_name: Column whose values form the first axis of both outputs.
        columns_name: Column whose values form the last axis of both outputs.
        Y_column1: Name of the target column; it is excluded from ``X``.

    Returns:
        X: ndarray[n_dates, n_feature, n_stocks]
        Y: ndarray[n_dates, n_stocks]
        X_feature_names: pandas Index of feature names, in the same order as
            axis 1 of ``X``.
    """
    wide = df.pivot_table(index=[index_name], columns=[columns_name], sort=True, dropna=False)
    Y1 = wide.loc[:, (Y_column1,)].to_numpy(dtype=np.double)

    features = wide.drop([Y_column1], axis=1)
    # ``columns.levels`` still lists the dropped target label and is not tied
    # to the physical column order, so take names and counts from the columns
    # that actually remain.
    feature_names = features.columns.get_level_values(0).unique()
    n_dates = len(features.index)
    n_features = len(feature_names)
    # The target block has one column per stock, same as every feature block.
    n_stocks = Y1.shape[1]

    X = features.to_numpy(dtype=np.double).reshape((n_dates, n_features, n_stocks))
    return X, Y1, feature_names


In [3]:
# Directory that is listed for stock files, and the file names found in it.
stock_path = "D:\\FinanceStock\\FinanceDatabase_xbx\\stock-trading-data-pro\\"
all_stocks = os.listdir(stock_path)

# Counter of stock files taken so far.
numbers = 0

# Columns kept from every loaded file.
input_features = [
    "股票代码",
    "交易日期",
    "开盘价",
    "最高价",
    "最低价",
    "收盘价",
    "成交量",
    "成交额",
    "收益率",
]
# (row-axis column, column-axis column, target column), unpacked into make_XY.
different_axis = ("交易日期", "股票代码", "收益率")

# Empty accumulators; the train/eval ones are filled by the loading loop.
total_df, total_train_df, total_eval_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

# Training window bounds.
begin_year, stop_year = pd.Timestamp('2017-01-01'), pd.Timestamp('2018-01-01')

# Evaluation window bounds.
eval_begin_year, eval_stop_year = pd.Timestamp('2018-01-01'), pd.Timestamp('2019-01-01')

In [4]:
# Load the stock files. For the demo only the first 21 eligible files are
# taken (the counter is tested with ``numbers <= 20``). Files whose name starts
# with 'bj', 'sh68' or 'sz30' are skipped and do not count toward the quota.
train_frames = []
eval_frames = []
for stock in all_stocks:
    if numbers > 20:
        # Quota reached: no later file could be accepted, so stop scanning.
        break
    if stock.startswith(('bj', 'sh68', 'sz30')):
        continue
    numbers += 1

    # Read the file once and cut both date windows out of the same frame.
    raw_df = pd.read_csv(stock_path + stock, encoding='gbk', skiprows=[0], parse_dates=["交易日期"])
    trade_dates = raw_df["交易日期"]
    # .copy() so the column assignments below write to independent frames
    # rather than to filtered views of raw_df.
    train_df = raw_df[(trade_dates < stop_year) & (trade_dates >= begin_year)].copy()
    eval_df = raw_df[(trade_dates < eval_stop_year) & (trade_dates >= eval_begin_year)].copy()
    # A stock is used only if it has rows in both windows.
    if len(train_df) <= 0 or len(eval_df) <= 0:
        continue

    # Target: next row's close relative to this row's close. The last row of
    # each window has no successor, so its NaN target is dropped.
    train_df["收益率"] = train_df["收盘价"].shift(-1) / train_df["收盘价"] - 1
    train_df.dropna(subset=["收益率"], inplace=True)
    train_frames.append(train_df)

    eval_df["收益率"] = eval_df["收盘价"].shift(-1) / eval_df["收盘价"] - 1
    eval_df.dropna(subset=["收益率"], inplace=True)
    eval_frames.append(eval_df)
    print(str(numbers)+'\r',end='')

# Concatenate once instead of growing the frames inside the loop;
# ignore_index=True already yields a fresh 0..n-1 index.
total_train_df = pd.concat([total_train_df, *train_frames], ignore_index=True)
total_train_df = total_train_df[input_features]
train_X,train_Y, feature_names = make_XY(total_train_df, *different_axis)

total_eval_df = pd.concat([total_eval_df, *eval_frames], ignore_index=True)
total_eval_df = total_eval_df[input_features]
eval_X,eval_Y, _ = make_XY(total_eval_df, *different_axis)

21

In [5]:
# Stack the training rows followed by the evaluation rows along axis 0.
X = np.concatenate((train_X, eval_X), axis=0)
Y = np.concatenate((train_Y, eval_Y), axis=0)
X_feature_names = feature_names

# One weight per row of X: 1 for rows taken from train_X, 0 for rows taken
# from eval_X.
n_train_rows, n_eval_rows = train_X.shape[0], eval_X.shape[0]
sample_weight = np.array([1] * n_train_rows + [0] * n_eval_rows)

In [6]:

# Operators referenced by name (presumably keys of the library's function
# map; confirm against functions.py). 'std_10' is a further name that is
# currently left out.
function_set_sample = [
    'common_add',
    'common_sub',
    'common_mul',
    'common_div',
    'common_log',
    'common_sqrt',
    'common_abs',
    'common_inv',
    'common_max',
    'common_min',
    'common_tan',
]
# Operators passed as objects imported from add_ts_function.
my_function = [dynamic_ts_std, dynamic_ts_mean, dynamic_ts_max]
function_set = [*function_set_sample, *my_function]

In [7]:
# ``metric`` must be one of the keys of the map defined in fitness.py.
# To save time, only two generations are shown here.
evolution_settings = dict(
    generations=2,
    population_size=200,
    tournament_size=10,
    init_depth=(1, 3),
    hall_of_fame=100,
    n_components=10,
    const_range=(-1, 1),
    parsimony_coefficient="auto",
)
genetic_operator_probabilities = dict(
    p_crossover=0.4,
    p_hoist_mutation=0.001,
    p_subtree_mutation=0.01,
    p_point_mutation=0.01,
    p_point_replace=0.4,
)
gp_sample = genetic.SymbolicTransformer(
    function_set=function_set,
    metric="pearson_3d",
    feature_names=X_feature_names,
    max_samples=1,
    verbose=1,
    random_state=0,
    n_jobs=-2,
    **evolution_settings,
    **genetic_operator_probabilities,
)

#### Scopes: TRA means the training set, OOB means out-of-sample, TOT means total.
#### All usable metrics are in fitness's _extra_map; to use this feature, write the
#### selection string in the form "<scope> <criterion>".
#### The criterion is an expression over the functions in fitness, and it must
#### also be valid when written as "if expression:".

In [8]:
# Selection rule in "<scope> <criterion>" form, one clause per scope.
selection_rule = "TRA ((pearson_3d>=0.02) and (spearman_3d >=0.002)) OOB (pearson_3d>0.0002)"
gp_sample.fit_3D(
    X,
    Y,
    feature_names,
    sample_weight=sample_weight,
    standard_expression=selection_rule,
    need_parallel=True,
)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     4.62        0.0137286        6        0.0392091              N/A      1.14m
   1     4.96        0.0277804        7        0.0463986              N/A      0.00s


In [9]:
# Show everything traced against the selection expression. ``result`` holds
# only the 10 best programs of the last generation, while show_tracing keeps
# every program that met the criteria and writes them to the given CSV path.
tracing_options = (True, "./show_tracing.csv")
result = gp_sample.show_program(
    X,
    Y,
    sample_weight=sample_weight,
    feature_names=X_feature_names,
    baseIC=False,
    show_tracing=tracing_options,
)
result.to_csv("./result_only10.csv")

In [12]:
# Pick one evolved factor and reproduce its score by hand.
# common_tan(common_min(收盘价, common_sub(成交量, 成交额))) -0.0392090667312741
volume_minus_amount = total_train_df["成交量"] - total_train_df["成交额"]
total_train_df["min_factor"] = np.minimum(total_train_df["收盘价"], volume_minus_amount)
total_train_df["factor"] = np.tan(total_train_df["min_factor"])
# Correlation between factor and target within each date, averaged over dates.
daily_ic = total_train_df.groupby("交易日期").apply(
    lambda day: day["factor"].corr(day["收益率"])
)
result = daily_ic.mean()
result

-0.039209066731274084

In [14]:
# Reproduce a factor that uses a rolling operator with an evolved window.
# dynamic_ts_std(common_sub(成交额, 最高价),56) -0.03192147687530125
total_train_df["Factor1"] = total_train_df["成交额"] - total_train_df["最高价"]
# 56-row rolling standard deviation computed separately for each stock.
# transform() returns values on the original row labels, so the assignment is
# correct whatever order the rows are in; rebuilding the index after
# groupby.apply would only line up if rows were already grouped by stock in
# sorted order.
total_train_df["factor"] = (
    total_train_df.groupby("股票代码")["Factor1"]
    .transform(lambda series: series.rolling(56).std())
)
# Correlation between factor and target within each date, averaged over dates.
result = total_train_df.groupby("交易日期").apply(lambda x: x["factor"].corr(x["收益率"])).mean()
result

-0.03192147687530125