In [34]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score
import talib as ta


# 数据加载

In [35]:
# Raw data: read the SH000001 CSV; the first CSV column becomes the row index.
data_path = './data/pre/SH000001.csv'
df = pd.read_csv(data_path, index_col=0)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0_level_0,symbol,close,high,low,open,pre_close,change,pct_chg,volume,amount,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1990-12-20,SH000001,104.3900,104.3900,99.9800,104.3000,99.9800,4.4100,4.4109,197,8.400000e+04,1
1990-12-21,SH000001,109.1300,109.1300,103.7300,109.0700,104.3900,4.7400,4.5407,28,1.600000e+04,1
1990-12-24,SH000001,114.5500,114.5500,109.1300,113.5700,109.1300,5.4200,4.9666,32,3.100000e+04,1
1990-12-25,SH000001,120.2500,120.2500,114.5500,120.0900,114.5500,5.7000,4.9760,15,6.000000e+03,1
1990-12-26,SH000001,125.2700,125.2700,120.2500,125.2700,120.2500,5.0200,4.1746,100,5.300000e+04,1
...,...,...,...,...,...,...,...,...,...,...,...
2021-01-04,SH000001,3502.9584,3511.6554,3457.2061,3474.6793,3473.0693,29.8891,0.8606,380790800,5.230000e+11,1
2021-01-05,SH000001,3528.6767,3528.6767,3484.7151,3492.1912,3502.9584,25.7183,0.7342,407995934,5.680000e+11,1
2021-01-06,SH000001,3550.8767,3556.8022,3513.1262,3530.9072,3528.6767,22.2000,0.6291,370230926,5.220000e+11,1
2021-01-07,SH000001,3576.2046,3576.2046,3526.6174,3552.9087,3550.8767,25.3279,0.7133,405348226,5.460000e+11,0


# 加入特征

In [36]:
# Add features: compute TA-Lib indicators from the price columns and store
# each result as a new column of `df` (columns are added in a fixed order).
open_, high, low, close = (df[col] for col in ('open', 'high', 'low', 'close'))

# Directional-movement, Aroon and ultimate-oscillator indicators.
df['ADX'] = ta.ADX(high, low, close, timeperiod=14)
df['ADXR'] = ta.ADXR(high, low, close, timeperiod=14)
aroon_down, aroon_up = ta.AROON(high, low, timeperiod=14)
df['AROONDOWN'] = aroon_down
df['AROONUP'] = aroon_up
df['AROONOSC'] = ta.AROONOSC(high, low, timeperiod=14)
df['ULTOSC'] = ta.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28)

# Moving averages of the close over several windows.
for window in (5, 10, 20, 30, 60):
    df[f'MA{window}'] = ta.MA(close, timeperiod=window, matype=0)

# Fast stochastic oscillator: two output series, stored as K and D.
fast_k, fast_d = ta.STOCHF(high, low, close, fastk_period=5, fastd_period=3, fastd_matype=0)
df['K'] = fast_k
df['D'] = fast_d

# MACD: three output series, stored as DIF, DEM and HISTOGRAM.
macd_line, macd_signal, macd_hist = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
df['DIF'] = macd_line
df['DEM'] = macd_signal
df['HISTOGRAM'] = macd_hist

# Bollinger Bands
band_upper, band_middle, band_lower = ta.BBANDS(close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
df['UPPERBAND'] = band_upper
df['MIDDLEBAND'] = band_middle
df['LOWERBAND'] = band_lower

# RSI of the close over several windows.
for window in (6, 12, 24):
    df[f'RSI{window}'] = ta.RSI(close, timeperiod=window)

df['EMA'] = ta.EMA(close, timeperiod=10)
df['NATR'] = ta.NATR(high, low, close, timeperiod=14)
df['CDL3OUTSIDE'] = ta.CDL3OUTSIDE(open_, high, low, close)
df['TRANGE'] = ta.TRANGE(high, low, close)

# Candlestick pattern recognition
df['CDLBELTHOLD'] = ta.CDLBELTHOLD(open_, high, low, close)
df['CDLCLOSINGMARUBOZU'] = ta.CDLCLOSINGMARUBOZU(open_, high, low, close)


# Bare last expression so the notebook renders the augmented frame.
df

Unnamed: 0_level_0,symbol,close,high,low,open,pre_close,change,pct_chg,volume,amount,...,LOWERBAND,RSI6,RSI12,RSI24,EMA,NATR,CDL3OUTSIDE,TRANGE,CDLBELTHOLD,CDLCLOSINGMARUBOZU
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-12-20,SH000001,104.3900,104.3900,99.9800,104.3000,99.9800,4.4100,4.4109,197,8.400000e+04,...,,,,,,,0,,0,0
1990-12-21,SH000001,109.1300,109.1300,103.7300,109.0700,104.3900,4.7400,4.5407,28,1.600000e+04,...,,,,,,,0,5.4000,0,0
1990-12-24,SH000001,114.5500,114.5500,109.1300,113.5700,109.1300,5.4200,4.9666,32,3.100000e+04,...,,,,,,,0,5.4200,0,0
1990-12-25,SH000001,120.2500,120.2500,114.5500,120.0900,114.5500,5.7000,4.9760,15,6.000000e+03,...,,,,,,,0,5.7000,0,0
1990-12-26,SH000001,125.2700,125.2700,120.2500,125.2700,120.2500,5.0200,4.1746,100,5.300000e+04,...,99.754864,,,,,,0,5.0200,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-04,SH000001,3502.9584,3511.6554,3457.2061,3474.6793,3473.0693,29.8891,0.8606,380790800,5.230000e+11,...,3339.421255,77.932831,67.823903,61.407759,3423.254038,1.188239,0,54.4493,0,0
2021-01-05,SH000001,3528.6767,3528.6767,3484.7151,3492.1912,3502.9584,25.7183,0.7342,407995934,5.680000e+11,...,3348.781056,81.207047,70.496011,63.127443,3442.421795,1.184311,0,43.9616,0,100
2021-01-06,SH000001,3550.8767,3556.8022,3513.1262,3530.9072,3528.6767,22.2000,0.6291,370230926,5.220000e+11,...,3398.997066,83.710605,72.635944,64.550280,3462.140869,1.180700,0,43.6760,0,0
2021-01-07,SH000001,3576.2046,3576.2046,3526.6174,3552.9087,3550.8767,25.3279,0.7133,405348226,5.460000e+11,...,3454.365733,86.223278,74.901631,66.107278,3482.879729,1.187641,0,49.5872,0,0


In [37]:
# Dataset split: cut `df` into three windows by slicing its index labels.
# `.loc[start:end]` label slicing includes both endpoints.
# NOTE(review): the "valid" window (2021-01-01..2021-01-07) lies after the
# "test" window (2020-12-01..2020-12-31), and the model cells visible in this
# notebook are scored on the test arrays only — confirm the naming/order is intended.
train_start_time = '2016-01-01'
train_end_time = '2020-11-30'
valid_start_time = '2021-01-01'
valid_end_time = '2021-01-07'
test_start_time = '2020-12-01'
test_end_time = '2020-12-31'
train_df = df.loc[train_start_time:train_end_time]
valid_df = df.loc[valid_start_time:valid_end_time]
test_df = df.loc[test_start_time:test_end_time]
# Bare last expression so the notebook renders the training slice.
train_df

Unnamed: 0_level_0,symbol,close,high,low,open,pre_close,change,pct_chg,volume,amount,...,LOWERBAND,RSI6,RSI12,RSI24,EMA,NATR,CDL3OUTSIDE,TRANGE,CDLBELTHOLD,CDLCLOSINGMARUBOZU
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-04,SH000001,3296.2580,3538.6890,3295.7410,3536.5890,3539.1820,-242.9240,-6.8638,184418423,2.410000e+11,...,3294.181289,15.963421,28.672971,39.698774,3521.130646,2.276708,0,243.4410,-100,-100
2016-01-05,SH000001,3287.7110,3328.1390,3189.6050,3196.6510,3296.2580,-8.5470,-0.2593,266882083,3.280000e+11,...,3189.741880,15.567306,28.244298,39.389240,3478.690711,2.420560,0,138.5340,100,0
2016-01-06,SH000001,3361.8400,3362.9740,3288.9330,3291.1950,3287.7110,74.1290,2.2547,238886670,2.850000e+11,...,3169.225831,32.897056,37.136602,43.384316,3457.445127,2.358012,0,75.2630,100,100
2016-01-07,SH000001,3125.0020,3309.6570,3115.8850,3309.6570,3361.8400,-236.8380,-7.0449,70569123,7.998199e+10,...,3054.563606,18.410004,25.934715,35.568306,3397.000922,2.917709,0,245.9550,-100,0
2016-01-08,SH000001,3186.4120,3235.4510,3056.8780,3194.6250,3125.0020,61.4100,1.9651,286440822,3.240000e+11,...,3082.382228,28.242410,31.757370,38.563006,3358.712027,3.057387,0,178.5730,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-24,SH000001,3402.8225,3413.9263,3396.2436,3407.4087,3414.4899,-11.6674,-0.3417,300994144,3.600000e+11,...,3331.570064,70.947945,64.554994,58.410928,3365.134619,1.105950,0,18.2463,0,0
2020-11-25,SH000001,3362.3274,3423.4853,3362.3274,3417.5150,3402.8225,-40.4951,-1.1900,322110207,3.830000e+11,...,3341.850899,50.137992,54.712472,54.128854,3364.624216,1.169245,0,61.1579,0,-100
2020-11-26,SH000001,3369.7334,3371.4537,3344.2822,3360.0609,3362.3274,7.4060,0.2203,255886822,3.170000e+11,...,3345.555905,53.153568,56.049409,54.761747,3365.553158,1.140937,0,27.1715,0,0
2020-11-27,SH000001,3408.3071,3408.3071,3364.4919,3373.8434,3369.7334,38.5737,1.1447,280649894,3.260000e+11,...,3348.977563,66.003988,62.362591,57.917368,3373.326602,1.139275,0,43.8152,0,100


In [38]:
# Feature selection
# Every candidate input column: raw quote fields plus the engineered indicators.
all_features = [
    'close', 'high', 'low', 'open', 'pre_close', 'change', 'pct_chg', 'volume', 'amount',
    'ADX', 'ADXR', 'AROONDOWN', 'AROONUP', 'AROONOSC', 'ULTOSC',
    'MA5', 'MA10', 'MA20', 'MA30', 'MA60',
    'K', 'D', 'DIF', 'DEM', 'HISTOGRAM',
    'UPPERBAND', 'MIDDLEBAND', 'LOWERBAND',
    'RSI6', 'RSI12', 'RSI24',
    'EMA', 'NATR', 'CDL3OUTSIDE', 'TRANGE', 'CDLBELTHOLD', 'CDLCLOSINGMARUBOZU',
]

# Reduced list: all_features without 'volume', 'AROONUP', 'DIF', 'LOWERBAND' and 'EMA'.
selected_features = [
    'close', 'high', 'low', 'open', 'pre_close', 'change', 'pct_chg', 'amount',
    'ADX', 'ADXR', 'AROONDOWN', 'AROONOSC', 'ULTOSC',
    'MA5', 'MA10', 'MA20', 'MA30', 'MA60',
    'K', 'D', 'DEM', 'HISTOGRAM',
    'UPPERBAND', 'MIDDLEBAND',
    'RSI6', 'RSI12', 'RSI24',
    'NATR', 'CDL3OUTSIDE', 'TRANGE', 'CDLBELTHOLD', 'CDLCLOSINGMARUBOZU',
]

# Switch between the full list and the reduced list here.
features = selected_features

# Feature matrix and label vector (NumPy arrays) for each split.
x_train, y_train = train_df[features].values, train_df['label'].values
x_valid, y_valid = valid_df[features].values, valid_df['label'].values
x_test, y_test = test_df[features].values, test_df['label'].values
# Bare last expression so the notebook renders the training matrix.
x_train

array([[3296.258 , 3538.689 , 3295.741 , ...,  243.441 , -100.    ,
        -100.    ],
       [3287.711 , 3328.139 , 3189.605 , ...,  138.534 ,  100.    ,
           0.    ],
       [3361.84  , 3362.974 , 3288.933 , ...,   75.263 ,  100.    ,
         100.    ],
       ...,
       [3369.7334, 3371.4537, 3344.2822, ...,   27.1715,    0.    ,
           0.    ],
       [3408.3071, 3408.3071, 3364.4919, ...,   43.8152,    0.    ,
         100.    ],
       [3391.7551, 3456.7365, 3391.7551, ...,   64.9814,    0.    ,
        -100.    ]])

In [39]:
def model_report(y_true, y_pred, name=''):
    """Print a classification report and the accuracy for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        name: Optional model name appended to the report header.

    Returns:
        None. The report is written to stdout only.
    """
    print(f'模型{name}')
    print('指标报告:')
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # value the default 'warn' setting yields) without an UndefinedMetricWarning.
    print(classification_report(y_true, y_pred, zero_division=0))
    print(f'准确率: {accuracy_score(y_true, y_pred)}')


# 多模型训练

In [40]:
# Logistic regression (L2 penalty) trained on the training arrays.
# NOTE(review): the features are fed in unscaled; in the recorded run this
# model predicted class 1 for every test row — consider standardizing inputs.
from sklearn.linear_model import LogisticRegression
lrModel = LogisticRegression(penalty='l2')
lrModel.fit(x_train, y_train)
pred_lrModel = lrModel.predict(x_test)

# Pass the model name so the report header is labelled like the other model cells.
model_report(y_test, pred_lrModel, 'LogisticRegression')

模型
指标报告:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.48      1.00      0.65        11

    accuracy                           0.48        23
   macro avg       0.24      0.50      0.32        23
weighted avg       0.23      0.48      0.31        23

准确率: 0.4782608695652174


In [41]:
# GBDT: gradient-boosted trees with 200 boosting stages.
from sklearn.ensemble import GradientBoostingClassifier
# Fixed seed (the same value the ensemble's random forest uses) so the
# reported metrics are reproducible across kernel restarts.
gbdtModel = GradientBoostingClassifier(n_estimators=200, random_state=2021)
gbdtModel.fit(x_train, y_train)
pred_gbdtModel = gbdtModel.predict(x_test)

model_report(y_test, pred_gbdtModel, 'GBDT')

模型GBDT
指标报告:
              precision    recall  f1-score   support

           0       0.70      0.58      0.64        12
           1       0.62      0.73      0.67        11

    accuracy                           0.65        23
   macro avg       0.66      0.66      0.65        23
weighted avg       0.66      0.65      0.65        23

准确率: 0.6521739130434783


In [42]:
# SVC: RBF-kernel support vector classifier with probability estimates enabled.
# NOTE(review): in the recorded run this model predicted class 1 for every
# test row; the inputs are not scaled — confirm whether that is intended.
from sklearn.svm import SVC
svmModel = SVC(probability=True, kernel='rbf').fit(x_train, y_train)  # fit() returns the estimator
pred_svmModel = svmModel.predict(x_test)

# Print the test-set report under the 'SVC' label.
model_report(
    y_test,
    pred_svmModel,
    name='SVC',
)

模型SVC
指标报告:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.48      1.00      0.65        11

    accuracy                           0.48        23
   macro avg       0.24      0.50      0.32        23
weighted avg       0.23      0.48      0.31        23

准确率: 0.4782608695652174


In [43]:
# QDA: quadratic discriminant analysis with default settings.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qdaModel = QuadraticDiscriminantAnalysis().fit(x_train, y_train)  # fit() returns the estimator
pred_qdaModel = qdaModel.predict(x_test)
# Print the test-set report under the estimator's class name.
model_report(
    y_test,
    pred_qdaModel,
    name='QuadraticDiscriminantAnalysis',
)

模型QuadraticDiscriminantAnalysis
指标报告:
              precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.69      0.82      0.75        11

    accuracy                           0.74        23
   macro avg       0.75      0.74      0.74        23
weighted avg       0.75      0.74      0.74        23

准确率: 0.7391304347826086


In [44]:
# RandomForest: 10000-tree random forest trained on the training arrays.
from sklearn.ensemble import RandomForestClassifier
# Fixed seed (the same value the ensemble's forest uses) so the metrics and the
# feature-importance ranking derived from this model are reproducible.
rfModel = RandomForestClassifier(n_estimators=10000, random_state=2021)
rfModel.fit(x_train, y_train)
pred_rfModel = rfModel.predict(x_test)

model_report(y_test, pred_rfModel, 'RandomForest')

模型RandomForest
指标报告:
              precision    recall  f1-score   support

           0       0.67      0.50      0.57        12
           1       0.57      0.73      0.64        11

    accuracy                           0.61        23
   macro avg       0.62      0.61      0.61        23
weighted avg       0.62      0.61      0.60        23

准确率: 0.6086956521739131


In [45]:
# Feature importance: rank the input columns by the fitted random forest's
# feature_importances_, highest first.
importances = rfModel.feature_importances_
# One [feature, importance] pair per input column, in `features` order.
features_importances = list(map(list, zip(features, importances)))
features_importances_df = (
    pd.DataFrame(features_importances, columns=['feature', 'importance'])
    .sort_values(by='importance', ascending=False)
)
features_sorted = features_importances_df['feature'].values
# Bare last expression so the notebook renders the ranked feature names.
features_sorted

array(['D', 'NATR', 'ULTOSC', 'TRANGE', 'K', 'pct_chg', 'change',
       'amount', 'ADX', 'DEM', 'ADXR', 'RSI6', 'HISTOGRAM', 'RSI24',
       'RSI12', 'MA60', 'MA30', 'MA20', 'pre_close', 'close', 'MA10',
       'UPPERBAND', 'open', 'low', 'high', 'MIDDLEBAND', 'MA5',
       'AROONOSC', 'AROONDOWN', 'CDLCLOSINGMARUBOZU', 'CDLBELTHOLD',
       'CDL3OUTSIDE'], dtype=object)

# 融合模型

In [46]:
# Ensemble model: soft-voting classifier over QDA, random forest and GBDT.
from sklearn.ensemble import VotingClassifier
clf1 = QuadraticDiscriminantAnalysis()
clf2 = RandomForestClassifier(n_estimators=10000, random_state=2021)
# Seed the boosted trees with the same value as the forest so the whole
# ensemble gives reproducible results across kernel restarts.
clf3 = GradientBoostingClassifier(n_estimators=200, random_state=2021)

# voting='soft' combines the members' predicted class probabilities rather
# than their hard label votes.
eclf = VotingClassifier(estimators=[('QDA', clf1), ('RF', clf2), ('GBDT', clf3)], voting='soft')
# Last expression: fit() returns the estimator, so the notebook renders its repr.
eclf.fit(x_train, y_train)


VotingClassifier(estimators=[('QDA', QuadraticDiscriminantAnalysis()),
                             ('RF',
                              RandomForestClassifier(n_estimators=10000,
                                                     random_state=2021)),
                             ('GBDT',
                              GradientBoostingClassifier(n_estimators=200))],
                 voting='soft')

In [47]:
# Predict: label the test rows with the voting ensemble and print its report.
pred_eclf = eclf.predict(x_test)
model_report(y_test, pred_eclf, 'eclf')

模型eclf
指标报告:
              precision    recall  f1-score   support

           0       0.70      0.58      0.64        12
           1       0.62      0.73      0.67        11

    accuracy                           0.65        23
   macro avg       0.66      0.66      0.65        23
weighted avg       0.66      0.65      0.65        23

准确率: 0.6521739130434783
