在沙盒环境中训练股票的机器学习模型

In [1]:
import os, copy, datetime
# Point ultron at the 'minden' data set.
# NOTE(review): this is assigned before any ultron module is imported, presumably
# because ultron reads ULTRON_DATA at import time — confirm before reordering.
os.environ['ULTRON_DATA'] = 'minden'
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): the star import below presumably supplies `enable_example_env` and
# `g_project_data`, which later cells use without any other visible definition —
# confirm and consider importing those names explicitly.
from ultron.env import *
from ultron.kdutils.file import dump_pickle
from ultron.factor.data.processing import factor_processing
from ultron.factor.data.standardize import standardize
from ultron.factor.data.winsorize import winsorize_normal
from ultron.factor.othgnz.othgnz_engine import OthgnzEngine
from ultron.factor.combine.combine_engine import CombineEngine
from ultron.optimize.model.linearmodel import LinearRegression

/var/log/ultron/2022-09-27.log


In [3]:
from ultron.sentry.Analysis.CrossSectionValueHolders import CSPercentileSecurityValueHolder
from ultron.sentry.Analysis.CrossSectionValueHolders import CSRankedSecurityValueHolder
from ultron.sentry.Analysis.SecurityValueHolders import SecurityDeltaValueHolder
from ultron.sentry.Analysis.SecurityValueHolders import SecurityLatestValueHolder
from ultron.sentry.Analysis.SecurityValueHolders import SecurityShiftedValueHolder
from ultron.sentry.Analysis.TechnicalAnalysis.StatelessTechnicalAnalysers import SecurityLogValueHolder
from ultron.sentry.Analysis.TechnicalAnalysis.StatelessTechnicalAnalysers import SecuritySignValueHolder
from ultron.sentry.Analysis.TechnicalAnalysis.StatefulTechnicalAnalysers import SecurityMovingMax
from ultron.sentry.Analysis.TechnicalAnalysis import SecurityMovingCorrelation
from ultron.sentry.Analysis.TechnicalAnalysis import SecurityMovingRank

In [4]:
# Switch ultron into its example (sandbox) environment. The log line emitted by this
# call states that data will then be read only from the sandbox directory.
enable_example_env()

2022-09-27 19:23:45,288 - [env.py:67] - ultron - INFO - enable example env will only read /home/kerry/ultron/rom/sandbox/minden


#### 加载指标数据

In [5]:
# Load the daily indicator table, parse the dates, shorten the price/volume column
# names and order the rows by (trade_date, code).
indicator_data = (
    pd.read_csv(os.path.join(g_project_data, 'indicator.csv'), index_col=0)
    .assign(trade_date=lambda frame: pd.to_datetime(frame['trade_date']))
    .rename(columns={
        'preClosePrice': 'pre_close',
        'openPrice': 'open',
        'closePrice': 'close',
        'highestPrice': 'high',
        'lowestPrice': 'low',
        'turnoverVol': 'volume',
    })
    .drop(['turnoverValue'], axis=1)
    # The set_index/reset_index round trip leaves trade_date as the first column
    # and gives the sorted frame a fresh 0..n-1 index.
    .set_index(['trade_date'])
    .sort_values(by=['trade_date', 'code'])
    .reset_index()
)
indicator_data.head()

Unnamed: 0,trade_date,code,pre_close,open,high,low,close,volume,p_change,atr21,atr14,key,date_week
0,2018-10-15,300002,3.07,3.08,3.12,2.99,3.0,9880442,-0.023065,0.140328,0.140576,328,0
1,2018-10-15,300009,8.667,8.736,8.979,8.58,8.742,21076577,0.008616,0.391429,0.388621,329,0
2,2018-10-15,300012,5.363,5.433,5.711,5.433,5.592,22248128,0.041813,0.26268,0.255165,330,0
3,2018-10-15,300014,6.192,6.192,6.465,6.134,6.15,25110741,-0.006806,0.351984,0.36932,331,0
4,2018-10-15,300017,7.362,7.322,7.362,7.004,7.074,46583251,-0.039906,0.342257,0.344513,332,0


#### 加载行业数据

In [6]:
# Load the industry classification table and parse its trade_date column.
industry_data = (
    pd.read_csv(os.path.join(g_project_data, 'industry_data.csv'), index_col=0)
    .assign(trade_date=lambda frame: pd.to_datetime(frame['trade_date']))
)
industry_data.head()

Unnamed: 0,trade_date,code,industryID1,industryName1
0,2020-08-17,300002,1030325,计算机
1,2020-08-17,300009,1030317,医药生物
2,2020-08-17,300012,1030328,综合
3,2020-08-17,300014,1030308,电气设备
4,2020-08-17,300017,1030327,通信


#### 构建5个alpha191因子作为当前组合因子

In [7]:
def Alpha191_1(data, interval=6):
    """Evaluate the notebook's Alpha191 #1 expression on ``data``.

    The expression tree, written with the ultron value holders it is built from:
    ``-1 * SecurityMovingCorrelation(interval, A, B)`` with
    ``A = CSPercentile(Delta(1, Log(Latest('volume'))))`` and
    ``B = CSRanked(CSPercentile((close - open) / open))``.
    NOTE(review): the meaning of each holder is read off its class name only —
    confirm against the ultron.sentry documentation.

    Args:
        data: frame with a ``trade_date`` column (moved to the index before
            evaluation), a ``code`` column (passed as ``category_field``) and
            the ``volume``, ``open`` and ``close`` fields.
        interval: first argument handed to ``SecurityMovingCorrelation``; it is
            also embedded in the output name.

    Returns:
        The result of ``transform``, named ``alpha191_1_<interval>D``.
    """
    volume_leg = CSPercentileSecurityValueHolder(
        SecurityDeltaValueHolder(
            1, SecurityLogValueHolder(SecurityLatestValueHolder('volume'))))

    intraday_move = (SecurityLatestValueHolder('close')
                     - SecurityLatestValueHolder('open')) / SecurityLatestValueHolder('open')
    price_leg = CSPercentileSecurityValueHolder(intraday_move)

    signal = -1 * SecurityMovingCorrelation(
        interval, volume_leg, CSRankedSecurityValueHolder(price_leg))

    output_name = 'alpha191_1_' + str(interval) + 'D'
    return signal.transform(data.set_index('trade_date'), name=output_name,
                            category_field='code')

def Alpha191_2(data, interval=1):
    """Evaluate the notebook's Alpha191 #2 expression on ``data``.

    The expression tree is
    ``SecurityDeltaValueHolder(interval, (close * 2 - low - high) / (high - low)) * -1``.
    NOTE(review): the delta holder's semantics are inferred from its name —
    confirm against the ultron.sentry documentation.

    Args:
        data: frame with ``trade_date`` and ``code`` columns plus the ``close``,
            ``low`` and ``high`` fields.
        interval: first argument handed to ``SecurityDeltaValueHolder``; it is
            also embedded in the output name.

    Returns:
        The result of ``transform``, named ``alpha191_2_<interval>D``.
    """
    numerator = (SecurityLatestValueHolder('close') * 2
                 - SecurityLatestValueHolder('low')
                 - SecurityLatestValueHolder('high'))
    price_range = SecurityLatestValueHolder('high') - SecurityLatestValueHolder('low')
    signal = SecurityDeltaValueHolder(interval, numerator / price_range) * -1

    output_name = 'alpha191_2_' + str(interval) + 'D'
    return signal.transform(data.set_index('trade_date'), name=output_name,
                            category_field='code')

def Alpha191_5(data, interval=5):
    """Evaluate the notebook's Alpha191 #5 expression on ``data``.

    The expression tree is
    ``SecurityMovingMax(3, SecurityMovingCorrelation(interval,
    SecurityMovingRank(interval, 'volume'), SecurityMovingRank(interval, 'high'))) * -1``.
    NOTE(review): holder semantics are inferred from the class names — confirm
    against the ultron.sentry documentation.

    Args:
        data: frame with ``trade_date`` and ``code`` columns plus the ``volume``
            and ``high`` fields.
        interval: argument shared by both moving ranks and the moving
            correlation; it is also embedded in the output name.

    Returns:
        The result of ``transform``, named ``alpha191_5_<interval>D``.
    """
    volume_rank = SecurityMovingRank(interval, 'volume')
    high_rank = SecurityMovingRank(interval, 'high')
    rank_corr = SecurityMovingCorrelation(interval, volume_rank, high_rank)
    # The max window of 3 is fixed and does not follow ``interval``.
    signal = SecurityMovingMax(3, rank_corr) * -1

    output_name = 'alpha191_5_' + str(interval) + 'D'
    return signal.transform(data.set_index('trade_date'), name=output_name,
                            category_field='code')

def Alpha191_6(data, interval=4):
    """Evaluate the notebook's Alpha191 #6 expression on ``data``.

    The expression tree is
    ``CSPercentile(Sign(Delta(interval, open * 0.85 + high * 0.15))) * -1``.
    NOTE(review): holder semantics are inferred from the class names — confirm
    against the ultron.sentry documentation.

    Args:
        data: frame with ``trade_date`` and ``code`` columns plus the ``open``
            and ``high`` fields.
        interval: first argument handed to ``SecurityDeltaValueHolder``; it is
            also embedded in the output name.

    Returns:
        The result of ``transform``, named ``alpha191_6_<interval>D``.
    """
    blended_price = (SecurityLatestValueHolder('open') * 0.85
                     + SecurityLatestValueHolder('high') * 0.15)
    direction = SecuritySignValueHolder(
        SecurityDeltaValueHolder(interval, blended_price))
    signal = CSPercentileSecurityValueHolder(direction) * -1

    output_name = 'alpha191_6_' + str(interval) + 'D'
    return signal.transform(data.set_index('trade_date'), name=output_name,
                            category_field='code')


def Alpha191_14(data, interval=1):
    """Evaluate the notebook's Alpha191 #14 expression on ``data``.

    The expression tree is
    ``Latest('close') - SecurityShiftedValueHolder(interval, Latest('close'))``.
    NOTE(review): the shift holder's semantics are inferred from its name —
    confirm against the ultron.sentry documentation.

    Args:
        data: frame with ``trade_date`` and ``code`` columns plus the ``close``
            field.
        interval: first argument handed to ``SecurityShiftedValueHolder``; it
            is also embedded in the output name.

    Returns:
        The result of ``transform``, named ``alpha191_14_<interval>D``.
    """
    current_close = SecurityLatestValueHolder('close')
    shifted_close = SecurityShiftedValueHolder(interval, SecurityLatestValueHolder('close'))
    signal = current_close - shifted_close

    output_name = 'alpha191_14_' + str(interval) + 'D'
    return signal.transform(data.set_index('trade_date'), name=output_name,
                            category_field='code')

#### 计算因子值

In [8]:
# Evaluate every factor with interval=6 and index each result by (trade_date, code)
# so the five outputs can be aligned column-wise afterwards.
alpha1, alpha2, alpha5, alpha6, alpha14 = (
    alpha_fn(indicator_data, interval=6).reset_index().set_index(['trade_date', 'code'])
    for alpha_fn in (Alpha191_1, Alpha191_2, Alpha191_5, Alpha191_6, Alpha191_14)
)

  if __name__ == '__main__':


In [9]:
# Align the five factor frames side by side and keep only the
# (trade_date, code) rows for which every factor has a value.
factors_data = (
    pd.concat([alpha1, alpha2, alpha5, alpha6, alpha14], axis=1)
    .dropna()
    .reset_index()
)
factors_data.head()

Unnamed: 0,trade_date,code,alpha191_1_6D,alpha191_2_6D,alpha191_5_6D,alpha191_6_6D,alpha191_14_6D
0,2018-10-23,300002,-0.348808,-0.068376,-0.901504,-0.643939,0.29
1,2018-10-23,300009,0.23444,-0.293772,-0.993808,-0.643939,0.599
2,2018-10-23,300012,-0.439858,-0.186259,-0.947921,-0.643939,0.665
3,2018-10-23,300014,0.452402,-0.268342,-0.911465,-0.643939,0.1
4,2018-10-23,300017,-0.758462,-0.112536,-1.0,-0.643939,0.635


In [10]:
# Factor columns are everything except the two identifier columns.
# NOTE(review): the following cell recomputes this same list from `diff_cols`.
features = [name for name in factors_data.columns if name not in ('trade_date', 'code')]

In [11]:
# Identifier columns that are not factors; every other column is a feature.
diff_cols = ['trade_date', 'code']
features = factors_data.columns[~factors_data.columns.isin(diff_cols)].tolist()

In [12]:
# Distinct industry ids; the per-date processing cell later uses them as the
# column labels of the risk-factor matrix.
neutralized_styles = industry_data['industryID1'].unique().tolist()
# One indicator column per industry id, keyed by (trade_date, code).
industry_dummy = (
    industry_data
    .set_index(['trade_date', 'code'])['industryID1']
    .pipe(pd.get_dummies)
    .reset_index()
)
industry_dummy.head()

Unnamed: 0,trade_date,code,1030303,1030305,1030308,1030309,1030310,1030312,1030314,1030317,1030318,1030324,1030325,1030326,1030327,1030328
0,2020-08-17,300002,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,2020-08-17,300009,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2020-08-17,300012,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2020-08-17,300014,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2020-08-17,300017,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [13]:
# Attach the industry indicator columns; the inner join keeps only the
# (trade_date, code) pairs present in both tables.
factors_data = pd.merge(factors_data, industry_dummy,
                        how='inner', on=['trade_date', 'code'])
factors_data.head()

Unnamed: 0,trade_date,code,alpha191_1_6D,alpha191_2_6D,alpha191_5_6D,alpha191_6_6D,alpha191_14_6D,1030303,1030305,1030308,...,1030310,1030312,1030314,1030317,1030318,1030324,1030325,1030326,1030327,1030328
0,2019-12-31,300002,0.3589,-0.914286,-0.488901,-0.287879,0.02,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2019-12-31,300009,-0.721964,-0.851203,-0.772881,-0.287879,0.056,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2019-12-31,300012,0.271081,-0.927395,-0.433861,-0.787879,0.399,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2019-12-31,300014,0.323163,-0.360846,-0.119523,-0.287879,-1.49,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2019-12-31,300017,0.382712,-1.144134,-0.745356,-0.287879,0.199,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# Process the factors one trading day at a time. For each day the raw feature
# matrix goes through `factor_processing` with winsorize + standardize as
# pre-processing steps, the industry indicator columns as `risk_factors`, and a
# final standardize as post-processing.
# NOTE(review): presumably `risk_factors` triggers industry neutralization —
# confirm against the ultron documentation.
alpha_res = []
# Group by a scalar key: the group label itself is not needed, and a one-element
# list would produce tuple keys (or a FutureWarning) on newer pandas.
grouped = factors_data.groupby('trade_date')
for _, day_slice in grouped:
    new_factors = factor_processing(day_slice[features].values,
                 pre_process=[winsorize_normal, standardize],
                 risk_factors=day_slice[neutralized_styles].values.astype(float),
                 post_process=[standardize])
    day_factors = pd.DataFrame(new_factors, columns=features)
    # Re-attach the identifier columns by position; a dedicated loop variable is
    # used so the outer loop's variables are not overwritten.
    for id_col in diff_cols:
        day_factors[id_col] = day_slice[id_col].values
    alpha_res.append(day_factors)

In [15]:
# Stack the per-day frames vertically. Each frame keeps its own 0..n-1 index, so
# index labels repeat across days in the combined frame.
factors_data = pd.concat(alpha_res, axis=0, ignore_index=False)
factors_data.tail()

Unnamed: 0,alpha191_1_6D,alpha191_2_6D,alpha191_5_6D,alpha191_6_6D,alpha191_14_6D,trade_date,code
61,0.979704,0.197876,-0.656967,-0.338423,-0.079586,2022-07-01,300630
62,-0.551795,0.482601,-0.85031,-0.338423,0.265403,2022-07-01,300676
63,-0.284215,1.434117,-0.831582,-0.338423,-0.511547,2022-07-01,300677
64,0.074113,0.865973,0.964553,-0.575319,0.420365,2022-07-01,300699
65,0.961641,-1.835359,0.221417,0.958865,-1.915221,2022-07-01,300724


#### 因子正交化（为合成做准备）

In [16]:
# Obtain the engine registered under the name 'symmetry' from OthgnzEngine.
# NOTE(review): presumably this is symmetric orthogonalization of the factors —
# confirm against the ultron documentation.
symmetry_othgnz = OthgnzEngine.create_engine('symmetry')

In [17]:
# Run the orthogonalization engine on the identifier + feature columns (missing
# values replaced by 0 first), then restore the (trade_date, code) row order.
factors_data = (
    symmetry_othgnz(factors_data[diff_cols + features].fillna(0), diff_cols)
    .sort_values(by=['trade_date', 'code'])
)
factors_data.tail()

Unnamed: 0,alpha191_1_6D,alpha191_2_6D,alpha191_5_6D,alpha191_6_6D,alpha191_14_6D,trade_date,code
61,0.006042,0.000641,-0.004082,-0.002391,-0.000395,2022-07-01,300630
62,-0.002104,0.002427,-0.003903,-0.000705,0.000883,2022-07-01,300676
63,-0.001134,0.006945,-0.004063,-0.001316,-0.00289,2022-07-01,300677
64,0.000221,0.004452,0.005412,-0.002712,0.002319,2022-07-01,300699
65,0.003713,-0.009985,-0.000376,0.001508,-0.009989,2022-07-01,300724


#### 等权合成

In [18]:
# Obtain the engine registered under the name 'equal_combine' from CombineEngine.
# NOTE(review): presumably an equal-weight combination of the features — confirm.
equal_combine = CombineEngine.create_engine('equal_combine')

In [19]:
# Work on a copy so the orthogonalized factors stay untouched, and store the
# combined signal in a new 'factor' column.
factors_df = factors_data.copy(deep=True)
factors_df['factor'] = equal_combine(factors_df, features)
# Keep only the identifiers and the combined factor, ordered by (trade_date, code).
equal_data = (
    factors_df.copy(deep=True)
    .drop(columns=features)
    .sort_values(by=['trade_date', 'code'])
    .assign(trade_date=lambda frame: pd.to_datetime(frame['trade_date']))
)
equal_data.tail()

Unnamed: 0,trade_date,code,factor
61,2022-07-01,300630,-3.7e-05
62,2022-07-01,300676,-0.00068
63,2022-07-01,300677,-0.000492
64,2022-07-01,300699,0.001938
65,2022-07-01,300724,-0.003026


#### 构建模型训练

In [20]:
def returns(market_data):
    """Compute next-period log returns of the close price for every security.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Must contain a ``close`` column and carry a two-level index whose last
        level identifies the security (the notebook passes
        ``(trade_date, code)``).

    Returns
    -------
    pandas.Series
        Named ``nxt1_ret`` and aligned to ``market_data.index``. Each value is
        ``log(close[t+1] / close[t])`` computed on forward-filled prices. The
        last period of each security, and any infinite ratio, is NaN.
    """
    # Pivot the last index level into columns: one row per period, one column
    # per security.
    price_tb = market_data['close'].unstack()
    # Carry the last known price forward over gaps. `ffill()` replaces the
    # deprecated `fillna(method='pad', inplace=True)` spelling.
    price_tb = price_tb.ffill()
    return_tb = np.log(price_tb.shift(-1) / price_tb)
    # A zero price produces +/-inf; treat those as missing.
    return_tb = return_tb.replace([np.inf, -np.inf], np.nan)
    # Back to long form, restricted to the rows present in the input.
    return_tb = return_tb.stack().reindex(market_data.index)
    return_tb.name = 'nxt1_ret'
    return return_tb

In [21]:
# Next-period log return of the close price for every (trade_date, code) row of
# the indicator data; this is the regression target used below.
rets = returns(indicator_data.set_index(['trade_date','code']))
rets.head()

trade_date  code  
2018-10-15  300002   -0.023610
            300009   -0.004241
            300012   -0.008982
            300014   -0.027697
            300017   -0.022733
Name: nxt1_ret, dtype: float64

In [22]:
# Pair each combined factor value with its next-period return and discard the
# rows that have no return.
train_data = (
    equal_data
    .merge(rets, on=['trade_date', 'code'])
    .dropna(subset=['nxt1_ret'])
)
train_data.head()

Unnamed: 0,trade_date,code,factor,nxt1_ret
0,2019-12-31,300002,1.582482e-05,0.030214
1,2019-12-31,300009,-0.0003845148,0.00399
2,2019-12-31,300012,6.566349e-20,0.000672
3,2019-12-31,300014,0.001219467,0.005758
4,2019-12-31,300017,0.000148068,0.031983


In [23]:
# ultron LinearRegression built with ['factor'] and fit_intercept=False.
# NOTE(review): the first argument is presumably the list of feature column
# names — confirm against the ultron documentation.
model = LinearRegression(['factor'], fit_intercept=False)

In [24]:
# Fit on the whole of train_data with the next-period return as the target.
# NOTE(review): no hold-out split is made in this notebook.
model.fit(train_data, train_data['nxt1_ret'].values)

#### 保存模型

In [25]:
# Persist the object returned by model.save() into the project data directory.
# NOTE(review): the file is written by dump_pickle, so it is presumably a pickle
# despite the '.h5' extension — confirm before loading it with an HDF5 reader.
dump_pickle(model.save(), os.path.join(g_project_data, 'model.h5'))

please wait! dump_pickle....: /home/kerry/ultron/rom/sandbox/minden/model.h5
