在沙盒环境中，将多个机器学习时序模型置于集成学习框架下进行涨跌预测

In [1]:
import os,pdb,itertools,copy,datetime
# Select the 'keim' data set through the ULTRON_DATA environment variable.
# NOTE(review): presumably this has to be set before ultron is imported - confirm.
os.environ.update({'ULTRON_DATA': 'keim'})

In [2]:
import pandas as pd
import numpy as np
from ultron.env import *
from ultron.optimize.model.treemodel import GradientBoostingClassifier
from ultron.optimize.model.treemodel import RandomForestClassifier
from ultron.optimize.model.treemodel import AdaBoostClassifier
from ultron.optimize.model.linearmodel import LogisticRegression

/var/log/ultron/2022-09-27.log


In [3]:
from ultron.optimize.model.treemodel import VotingClassifier
from ultron.optimize.model.treemodel import StackingClassifier

In [4]:
# Switch ultron to its example environment; the log line recorded after this
# cell reports that it will then only read from the sandbox directory.
enable_example_env()

2022-09-27 19:19:57,778 - [env.py:67] - ultron - INFO - enable example env will only read /home/kerry/ultron/rom/sandbox/keim


#### 加载训练集

In [5]:
# Read the training set from the project data directory, using the first CSV
# column as the DataFrame index, then preview the first rows.
train_path = os.path.join(g_project_data, 'train_datas.csv')
train_data = pd.read_csv(train_path, index_col=0)
train_data.head()

Unnamed: 0,trade_date,code,BM_MainFar_80D,BM_RecentFar_20D,BM_RecentFar_40D,BM_RecentFar_80D,BM_RecentSecond_20D,BM_RecentSecond_40D,B_FarSpot,B_MainSpot,...,TS_RecentSecond,T_DnIntraday_5D,T_DnVolatility_1_10D,T_DnVolatility_2_20D,WeightNetIntTotalChg5D,WeightShortVolRelTotIntChg,inventory,profitratio,value,signal
0,2021-02-05,A,0.108297,0.028482,0.048874,0.112566,0.037459,0.018997,-0.115713,-0.633489,...,0.080587,-0.010746,-0.011118,-0.001603,0.000839,0.192546,,,1.130717,1.0
1,2021-02-05,AL,-0.017185,-0.010713,-0.028431,-0.043583,-0.015898,-0.022686,0.01055,-0.058663,...,-0.001708,-0.006366,-0.006237,6e-06,-0.000552,-0.095002,-71.400002,-0.126352,0.936497,1.0
2,2021-02-05,BU,0.002268,0.074625,0.144817,0.146292,0.060386,0.13083,-0.087043,-0.147928,...,-0.052928,-0.006808,-0.006575,0.003424,0.001066,0.126363,-61.389999,0.083217,0.250348,1.0
3,2021-02-05,C,0.021003,-0.003571,0.046976,0.034767,0.002104,0.030268,0.103967,0.17228,...,0.009364,-0.003704,-0.007573,-0.000428,-0.000842,-0.037971,-427.600006,,0.696008,1.0
4,2021-02-05,CF,0.013602,0.00397,0.006347,0.023968,-0.001802,0.002904,-0.024857,-0.031358,...,-0.034307,-0.00451,-0.007883,-0.000114,-0.000248,-0.029101,-618.409973,0.015356,0.971867,1.0


In [6]:
# Columns that are not used as model inputs: identifiers, auxiliary fields and
# the 'signal' column that serves as the label further below.
non_feature_columns = ('trade_date', 'code', 'inventory', 'profitratio',
                       'value', 'signal')
# Every remaining column, in its original order, is treated as a feature.
features = [name for name in train_data.columns
            if name not in non_feature_columns]

#### 构建训练集

In [7]:
# Feature matrix: the selected feature columns with missing values replaced by 0.
feature_frame = train_data[features]
X = feature_frame.fillna(0)
# Label vector: the 'signal' column as a plain array.
signal_column = train_data['signal']
Y = signal_column.values

#### 注意：在实际应用中，训练集和测试集应使用不同的数据（本示例为演示方便，直接在训练集上评分）

#### 训练模型

In [8]:
# Base model 1: random forest configured with n_estimators=20.
m1 = RandomForestClassifier(features=features, n_estimators=20)
m1.fit(X, Y)
# The score is computed on the same data the model was fitted on.
m1_train_score = m1.score(X, Y)
print(m1_train_score)

0.9983548359165295


In [9]:
# Base model 2: gradient boosting; only the feature list is passed explicitly.
m2 = GradientBoostingClassifier(features=features)
m2.fit(X, Y)
# The score is computed on the same data the model was fitted on.
m2_train_score = m2.score(X, Y)
print(m2_train_score)

0.6252489393021041


In [10]:
# Base model 3: AdaBoost; only the feature list is passed explicitly.
m3 = AdaBoostClassifier(features=features)
m3.fit(X, Y)
# The score is computed on the same data the model was fitted on.
m3_train_score = m3.score(X, Y)
print(m3_train_score)

0.5824746731318728


#### 单体模型训练完成后，采用集成学习方式进行组合，共有两种：1. StackingClassifier；2. VotingClassifier

##### VotingClassifier

In [11]:
# Pair each base model's underlying estimator (its .device attribute) with a name.
named_estimators = [
    (label, model.device)
    for label, model in (('m1', m1), ('m2', m2), ('m3', m3))
]
# Voting ensemble over the three base models, configured with voting='soft'.
ensemble1 = VotingClassifier(
    features=features,
    estimators=named_estimators,
    voting='soft',
)
ensemble1.fit(X, Y)
# The score is computed on the same data the ensemble was fitted on.
ensemble1_train_score = ensemble1.score(X, Y)
print(ensemble1_train_score)

0.9947181574162265


##### StackingClassifier

In [12]:
# Underlying estimators (the .device attribute) of the three base models.
base_classifiers = [model.device for model in (m1, m2, m3)]
# A logistic regression is supplied as the meta classifier on top of them.
meta_model = LogisticRegression()
ensemble2 = StackingClassifier(
    features=features,
    classifiers=base_classifiers,
    meta_classifier=meta_model.device,
)
ensemble2.fit(X, Y)
# The score is computed on the same data the ensemble was fitted on.
ensemble2_train_score = ensemble2.score(X, Y)
print(ensemble2_train_score)

0.9979218979998268


更多模型训练示例可参考 optimization_mining_model.ipynb

##### 预测结果

##### 随机构造测试集，用于预测

In [13]:
# Build a synthetic test set: 1000 rows of standard-normal noise, one column per
# feature, so that the fitted ensembles have something to predict on.
n_test_rows = 1000
random_values = np.random.randn(n_test_rows, len(features))
test_data = pd.DataFrame(random_values, columns=features)
test_data.head()

Unnamed: 0,BM_MainFar_80D,BM_RecentFar_20D,BM_RecentFar_40D,BM_RecentFar_80D,BM_RecentSecond_20D,BM_RecentSecond_40D,B_FarSpot,B_MainSpot,B_RecentSpot,B_SecondSpot,...,R_UpVolatility_1_40D,R_UpVolatility_1_60D,TS_MainFar,TS_RecentFar,TS_RecentSecond,T_DnIntraday_5D,T_DnVolatility_1_10D,T_DnVolatility_2_20D,WeightNetIntTotalChg5D,WeightShortVolRelTotIntChg
0,0.24584,0.067048,0.785072,1.300494,0.893289,0.451512,-0.018405,1.93418,-0.836682,-0.754481,...,-0.560768,0.555404,0.935174,0.708733,-0.325861,-1.72616,0.098658,-0.013302,0.28504,-0.130892
1,0.180911,0.982446,1.574092,-0.993717,0.818141,0.708741,-0.234551,-0.508018,-1.484246,-0.52128,...,0.33792,1.78714,0.83051,-0.105903,0.767895,-0.365703,-1.956811,1.408876,0.42068,-1.230145
2,0.772816,-1.975944,0.069778,-1.384341,1.720383,1.090461,0.641176,-0.570346,0.388064,-0.256326,...,-0.206377,-0.985002,-0.122053,1.099711,-0.224172,-1.006074,1.758399,0.014127,0.577504,0.858479
3,0.32756,1.940627,0.638636,-0.796469,0.172058,0.378696,0.748282,0.361551,-0.896843,-1.198303,...,0.799796,-0.459644,-0.141161,-0.328882,1.437545,-0.689705,0.809782,-0.721805,-1.00503,-0.883973
4,-1.422417,0.710785,0.248184,2.177396,-0.742559,-0.658575,0.684626,-0.315628,0.952025,0.854461,...,0.918378,-2.74092,-0.496655,0.014973,-1.0485,0.191986,0.523554,0.123838,2.209885,1.026315


In [14]:
# Predict with the VotingClassifier ensemble on the randomly generated test set.
y1 = ensemble1.predict(test_data)

In [15]:
# Predict with the StackingClassifier ensemble on the randomly generated test set.
y2 = ensemble2.predict(test_data)