在沙盒环境，使用机器学习方式对多因子进行合成，并构造因子加权组合

方法:
- 1.训练多个有效且相关度低模型，此处可参照GA训练模型方式.不同基础模型不同X特征数据维度
- 2.通过多个X特征维度预测出来的Y值作为，下一层模型的X输入值
- 3.为了例子方便,Y值统一使用下一期的绝对收益

In [1]:
import os,pdb,itertools,copy,datetime
os.environ['ULTRON_DATA'] = 'keim'

In [2]:
import pandas as pd
import numpy as np
from ultron.kdutils.parallel import delayed, Parallel
from ultron.factor.data.processing import factor_processing
from ultron.factor.data.standardize import standardize
from ultron.strategy.experimental.single_factor import SingleFactor
from ultron.optimize.model.linearmodel import LassoRegression
from ultron.optimize.model.linearmodel import LinearRegression
from ultron.optimize.model.linearmodel import BayesianRegression
from ultron.optimize.model.treemodel import RandomForestRegressor
from ultron.optimize.model.treemodel import LGBMRegressor
from ultron.optimize.model.treemodel import XGBRegressor

In [3]:
from ultron.env import *

/var/log/ultron/2022-09-27.log


In [4]:
enable_example_env()

2022-09-27 19:12:48,284 - [env.py:67] - ultron - INFO - enable example env will only read /home/kerry/ultron/rom/sandbox/keim


#### 加载行情数据

In [5]:
market_data = pd.read_csv(os.path.join(g_project_data, 'market_data.csv'), index_col=0)
market_data['trade_date'] = pd.to_datetime(market_data['trade_date'])
market_data.head()

Unnamed: 0,trade_date,code,openPrice,highestPrice,lowestPrice,closePrice,turnoverVol
0,2017-10-27,A,4462.578191,4463.801485,4413.646412,4435.665713,158774
1,2017-10-27,AL,15625.658581,15658.904663,15430.931529,15449.92929,293630
2,2017-10-27,BU,3310.339921,3336.950371,3283.729472,3302.356787,461826
3,2017-10-27,C,2009.751001,2014.561895,2001.331936,2002.53466,375480
4,2017-10-27,CF,20517.496003,20531.174333,20408.069357,20449.104349,84032


#### 选择中因子

In [6]:
sel_factor = pd.read_csv(os.path.join(g_project_data, 'sel_factor.csv'), index_col=0)
sel_factor = sel_factor.drop_duplicates(subset=['factor','window'])
sel_factor = sel_factor[(sel_factor['factor'] != 'inventory')]
sel_factor = sel_factor[(sel_factor['factor'] != 'profitratio')]
sel_factor.head()

Unnamed: 0,factor,window,weekday,bins
0,BM_MainFar_80D,23,5,5
1,BM_MainFar_80D,25,5,5
2,BM_MainFar_80D,27,5,5
3,BM_RecentFar_20D,5,1,5
4,BM_RecentFar_40D,3,1,3


#### 读取因子

In [7]:
total_data = pd.read_csv(os.path.join(g_project_data, 'factor.csv'), index_col=0)
total_data['trade_date'] = pd.to_datetime(total_data['trade_date'] )
factor_data = total_data[['trade_date','code'] + sel_factor['factor'].unique().tolist()]
factor_data.head()

Unnamed: 0,trade_date,code,BM_MainFar_80D,BM_RecentFar_20D,BM_RecentFar_40D,BM_RecentFar_80D,BM_RecentSecond_20D,BM_RecentSecond_40D,B_FarSpot,B_MainSpot,...,R_UpVolatility_1_40D,R_UpVolatility_1_60D,TS_MainFar,TS_RecentFar,TS_RecentSecond,T_DnIntraday_5D,T_DnVolatility_1_10D,T_DnVolatility_2_20D,WeightNetIntTotalChg5D,WeightShortVolRelTotIntChg
0,2017-10-27,A,-0.033259,-0.026646,-0.019436,-0.041974,-0.023047,-0.013509,-0.042729,0.002378,...,-0.00393,-0.005081,-0.05761,-0.079619,-0.104757,-0.00835,-0.007715,-0.002168,-0.000633,-0.037579
1,2017-10-27,AL,-0.001423,0.001697,-0.000937,0.000587,0.001133,-0.000539,-0.076121,-0.084726,...,-0.010231,-0.012803,-0.069381,-0.068413,-0.067663,-0.005843,-0.008381,0.000165,-0.000352,-0.012891
2,2017-10-27,BU,-0.016537,0.059635,-0.032271,-0.034618,0.069999,-0.027086,-0.124574,-0.321128,...,-0.010399,-0.012054,-0.102761,-0.124225,-0.159247,-0.005098,-0.009538,0.001268,0.002481,0.275875
3,2017-10-27,C,0.007939,-0.005224,-0.014003,0.025361,-0.001541,-0.007955,-0.006522,0.172635,...,-0.004324,-0.004901,-0.063351,-0.072537,-0.092543,-0.002866,-0.003349,-0.000979,0.002547,0.245555
4,2017-10-27,CF,-0.02396,0.002346,-0.028774,-0.008043,0.004838,-0.009858,0.05664,0.311288,...,-0.006368,-0.006944,-0.025782,-0.024471,-0.003359,-0.003975,-0.004373,-0.00142,-0.000643,-0.131799


#### 时序标准化

In [8]:
parallel = Parallel(n_jobs=1, verbose=1, pre_dispatch='2*n_jobs')

In [9]:
def _build(total_data, window, columns):
    sf = SingleFactor(factor_data=None, market_data=None, codes=None, columns=None)
    normalize_data = sf.normalize(factor_data=total_data.dropna(subset=columns), 
                                    windows=window, columns=columns)
    normalize_data = normalize_data.sort_values(by=['trade_date','code'])
    return normalize_data.set_index(['trade_date','code']).rename(
        columns={columns[0]:columns[0] + '_' + str(window)})

In [10]:
out = parallel(delayed(_build)(factor_data[['trade_date','code', v['factor']]],
                   window=v['window'], columns=[v['factor']]) for v in sel_factor.to_dict(orient='records'))

In [11]:
factors_data = pd.concat(out, axis=1).reset_index().fillna(0)
factors_data.head()

Unnamed: 0,trade_date,code,BM_MainFar_80D_23,BM_MainFar_80D_25,BM_MainFar_80D_27,BM_RecentFar_20D_5,BM_RecentFar_40D_3,BM_RecentFar_40D_9,BM_RecentFar_40D_11,BM_RecentFar_40D_13,...,TS_MainFar_5,TS_RecentFar_7,TS_RecentSecond_7,T_DnIntraday_5D_23,T_DnVolatility_1_10D_21,T_DnVolatility_2_20D_25,T_DnVolatility_2_20D_27,WeightNetIntTotalChg5D_11,WeightNetIntTotalChg5D_13,WeightShortVolRelTotIntChg_9
0,2017-10-31,A,0.0,0.0,0.0,0.0,-0.397406,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-10-31,AL,0.0,0.0,0.0,0.0,0.429146,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-10-31,BU,0.0,0.0,0.0,0.0,1.145945,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-10-31,C,0.0,0.0,0.0,0.0,1.127665,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-10-31,CF,0.0,0.0,0.0,0.0,1.040272,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 横截面标准化

期货品种不做去极值和中性化

In [12]:
diff_cols = ['trade_date','code']
features = [col for col in factors_data.columns if col not in diff_cols]

In [13]:
alpha_res = []
grouped = factors_data.groupby(['trade_date'])
for k, g in grouped:
    new_factors = factor_processing(g[features].values,
                 pre_process=[standardize])
    f = pd.DataFrame(new_factors, columns=features)
    for k in diff_cols:
        f[k] = g[k].values
    alpha_res.append(f)

  return (x - simple_mean(x, axis=0)) / np.maximum(simple_std(x, axis=0, ddof=ddof), 1e-8)


In [14]:
factors_data = pd.concat(alpha_res)
factors_data.tail()

Unnamed: 0,BM_MainFar_80D_23,BM_MainFar_80D_25,BM_MainFar_80D_27,BM_RecentFar_20D_5,BM_RecentFar_40D_3,BM_RecentFar_40D_9,BM_RecentFar_40D_11,BM_RecentFar_40D_13,BM_RecentFar_80D_9,BM_RecentFar_80D_11,...,TS_RecentSecond_7,T_DnIntraday_5D_23,T_DnVolatility_1_10D_21,T_DnVolatility_2_20D_25,T_DnVolatility_2_20D_27,WeightNetIntTotalChg5D_11,WeightNetIntTotalChg5D_13,WeightShortVolRelTotIntChg_9,trade_date,code
30,-1.476446,-1.582866,-1.660658,-1.378918,-0.761049,-1.548249,-1.49199,-1.322412,-1.561855,-1.49489,...,0.792496,0.279589,-0.568689,-0.184132,-0.234938,-0.12278,-0.360587,-0.252067,2022-06-22,TA
31,-0.087925,-0.140685,-0.174119,0.737069,-1.077857,-0.620455,-0.444036,-0.432019,-0.429449,-0.466354,...,0.458422,-0.978035,-1.142478,-0.939997,-0.987123,0.607201,0.203519,-0.531509,2022-06-22,V
32,-1.147232,-0.96806,-0.922699,-1.510556,-1.203488,-1.385261,-1.44778,-1.424192,-1.471564,-1.515301,...,-2.133296,-1.419082,-1.314467,-1.86896,-1.925835,0.901514,0.922691,0.496849,2022-06-22,Y
33,0.206059,0.167452,0.139033,0.447884,-0.253253,-0.519678,-0.608452,-0.489815,-1.865444,-1.926757,...,-0.999571,2.37413,0.355085,-0.581136,-0.640288,-1.046225,-0.186613,0.219181,2022-06-22,ZC
34,1.757122,1.660916,1.357403,0.818272,-0.948261,0.805499,0.920428,0.957197,0.541772,0.589358,...,0.057476,-0.667581,0.415297,1.77377,1.807674,-0.747155,-0.57908,0.876367,2022-06-22,ZN


构建模型X,Y值

##### 目前使用收益率为第二天的收益率为Y，来构建模型

In [15]:
def returns(market_data):
    price_tb = market_data['closePrice'].unstack()
    price_tb.fillna(method='pad', inplace=True)
    return_tb = np.log(price_tb.shift(-1) / price_tb)
    return_tb = return_tb.replace([np.inf, -np.inf], np.nan)
    return_tb = return_tb.stack().reindex(market_data.index)
    return_tb.name = 'nxt1_ret'
    return return_tb

In [16]:
rets = returns(market_data.set_index(['trade_date','code']))
rets.head()

trade_date  code
2017-10-27  A      -0.000276
            AL      0.001222
            BU      0.019945
            C      -0.005420
            CF      0.001003
Name: nxt1_ret, dtype: float64

In [17]:
train_data = factors_data.merge(rets, on=['trade_date','code'])
train_data = train_data.dropna(subset=['nxt1_ret']).fillna(0)
train_data.head()

Unnamed: 0,BM_MainFar_80D_23,BM_MainFar_80D_25,BM_MainFar_80D_27,BM_RecentFar_20D_5,BM_RecentFar_40D_3,BM_RecentFar_40D_9,BM_RecentFar_40D_11,BM_RecentFar_40D_13,BM_RecentFar_80D_9,BM_RecentFar_80D_11,...,T_DnIntraday_5D_23,T_DnVolatility_1_10D_21,T_DnVolatility_2_20D_25,T_DnVolatility_2_20D_27,WeightNetIntTotalChg5D_11,WeightNetIntTotalChg5D_13,WeightShortVolRelTotIntChg_9,trade_date,code,nxt1_ret
0,0.0,0.0,0.0,0.0,-0.331122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-10-31,A,-0.000275
1,0.0,0.0,0.0,0.0,0.564609,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-10-31,AL,-0.007976
2,0.0,0.0,0.0,0.0,1.341401,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-10-31,BU,-0.011236
3,0.0,0.0,0.0,0.0,1.321591,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-10-31,C,0.002411
4,0.0,0.0,0.0,0.0,1.226883,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-10-31,CF,0.000334


In [18]:
columns = np.array(features)

为了到达不同模型不同特征目的，故在此处随机从60个基础特征中选择40个特征用于各个模型。实盘做法可参照GA挖掘机器学习模型特征及参数方式

##### 模型1   LinearRegression

In [19]:
feature = features[0:40]
data = train_data[feature]
linear_regression = LinearRegression(features=feature, fit_intercept=False)
linear_regression.fit(data, train_data['nxt1_ret'].values)
weights = linear_regression.weights
data = data * weights
linear_regression = LinearRegression(features=feature, fit_intercept=False)
linear_regression.fit(data, train_data['nxt1_ret'].values)

#### 模型2 LassoRegression

In [20]:
feature = features[10:50]
data = train_data[feature]
lasso_regression = LassoRegression(features=feature, fit_intercept=False)
lasso_regression.fit(data, train_data['nxt1_ret'].values)
weights = lasso_regression.weights
data = data * weights
lasso_regression = LassoRegression(features=feature, fit_intercept=False)
lasso_regression.fit(data, train_data['nxt1_ret'].values)

#### 模型3 BayesianRegression

In [21]:
feature = features[20:60]
data = train_data[feature]
bayesian_regression = BayesianRegression(features=feature, fit_intercept=False)
bayesian_regression.fit(data, train_data['nxt1_ret'].values)
weights = bayesian_regression.weights
data = data * weights
bayesian_regression = BayesianRegression(features=feature, fit_intercept=False)
bayesian_regression.fit(data, train_data['nxt1_ret'].values)

目前数据暂时训练集数据和数据集数据共用一套

#### 构建第二层模型X值

In [22]:
y = linear_regression.predict(train_data[linear_regression.features])
X1 = train_data[['trade_date','code']]
X1['X1'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
y = lasso_regression.predict(train_data[lasso_regression.features])
X2 = train_data[['trade_date','code']]
X2['X2'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
y = bayesian_regression.predict(train_data[bayesian_regression.features])
X3 = train_data[['trade_date','code']]
X3['X3'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


#### 合并数据集

In [25]:
X = pd.concat([X1.set_index(['trade_date','code']), X2.set_index(['trade_date','code']), 
           X3.set_index(['trade_date','code'])],axis=1).reset_index()
X.head()

Unnamed: 0,trade_date,code,X1,X2,X3
0,2017-10-31,A,12.082008,0.0,0.293122
1,2017-10-31,AL,-20.601501,0.0,-0.420751
2,2017-10-31,BU,-48.945184,0.0,0.765734
3,2017-10-31,C,-48.22235,0.0,-0.169471
4,2017-10-31,CF,-44.766655,0.0,0.410419


In [26]:
train_data = X.merge(rets.reset_index(), on=['trade_date','code'])
train_data.head()

Unnamed: 0,trade_date,code,X1,X2,X3,nxt1_ret
0,2017-10-31,A,12.082008,0.0,0.293122,-0.000275
1,2017-10-31,AL,-20.601501,0.0,-0.420751,-0.007976
2,2017-10-31,BU,-48.945184,0.0,0.765734,-0.011236
3,2017-10-31,C,-48.22235,0.0,-0.169471,0.002411
4,2017-10-31,CF,-44.766655,0.0,0.410419,0.000334


#### 构建第二层模型

此处选择LGBMRegressor模型， LGBMRegressor主观上分析适合小数据做回归

In [27]:
regressor = LGBMRegressor(features=['X1','X2','X3'])
regressor.fit(train_data, train_data['nxt1_ret'].values)

In [28]:
Y = regressor.predict(train_data[regressor.features])
factors_data = train_data[['trade_date','code']]
factors_data['factor'] = y
factors_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,trade_date,code,factor
0,2017-10-31,A,0.293122
1,2017-10-31,AL,-0.420751
2,2017-10-31,BU,0.765734
3,2017-10-31,C,-0.169471
4,2017-10-31,CF,0.410419


#### 构建因子加权组合

In [29]:
def _weighted(data, equal=1):
    if equal == 0:
        weighted = data[['factor']] / data[['factor']].sum()
        weighted['code'] = data['code'].values
        
    else:
        weighted =  1 / len(data)
        weighted = pd.DataFrame([weighted for i in range(0, len(data))],
                                columns=['factor'],
                                index=data.index)
    weighted['code'] = data['code'].values
    weighted = weighted.reset_index().drop(['trade_date'],axis=1)
    return weighted.rename(columns={'market_values':'weight'})

In [30]:
weighted = factors_data.set_index('trade_date').groupby(
    level=['trade_date']).apply(
    lambda x: _weighted(x))

In [31]:
weighted.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,factor,code
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-06-21,30,0.028571,TA
2022-06-21,31,0.028571,V
2022-06-21,32,0.028571,Y
2022-06-21,33,0.028571,ZC
2022-06-21,34,0.028571,ZN
