# 初始化

In [1]:
# 初始化
import sys
import time
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# 数据读取
df_raw = pd.read_pickle('data_stock.pkl')
# df_raw = df_raw.iloc[:,:63]
df_raw.head()
# APE pricetoearning ADY股息率 TS 期限利差 DRS 违约利差

Unnamed: 0,date,code,E_RET,FAG,TAG,NPG,TPG,OPG,RG,SG,...,ATURNOVER,AVAR,ABM,RFR,TBR_1Y,TBR_5Y,TBR_10Y,TS,DRS,CPI
0,2004-01-31,1,0.088454,-0.085398,0.529672,-0.088587,-0.353633,-0.144732,-0.06698,0.649646,...,0.62298,0.023678,-1.187669,-0.708848,-0.950604,-0.793329,-0.036616,1.312954,-1.639785,0.547714
1,2004-01-31,2,0.198254,-0.462371,1.278572,0.611947,0.96645,0.584612,2.187683,1.123943,...,0.62298,0.023678,-1.187669,-0.708848,-0.950604,-0.793329,-0.036616,1.312954,-1.639785,0.547714
2,2004-01-31,6,0.089354,-0.133499,-1.131546,-0.488548,-0.694846,-0.133311,0.255228,0.319366,...,0.62298,0.023678,-1.187669,-0.708848,-0.950604,-0.793329,-0.036616,1.312954,-1.639785,0.547714
3,2004-01-31,9,0.177554,-0.121645,-0.213518,0.760117,-0.193851,-0.054944,0.035656,-0.267355,...,0.62298,0.023678,-1.187669,-0.708848,-0.950604,-0.793329,-0.036616,1.312954,-1.639785,0.547714
4,2004-01-31,12,0.244254,0.262967,0.632787,0.019302,0.012431,0.052023,-0.119654,-0.016191,...,0.62298,0.023678,-1.187669,-0.708848,-0.950604,-0.793329,-0.036616,1.312954,-1.639785,0.547714


In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def eval_class(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred) 
    prec = precision_score(y_test, y_pred) 
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_eval = pd.DataFrame({'Accuracy':acc, 'Precision':prec, 'Recall':recall, 'F1':f1}, index=[0])
    return df_eval

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error # R2, MSE, MAE, MAPE
def eval_result(y_real, y_pred): 
    y_real, y_pred = np.array(y_real).ravel(), np.array(y_pred).ravel()
    scale = np.max(y_real) - np.min(y_real) # scale is important for RMSE and MAE
    r2 = r2_score(y_real, y_pred)
    rmse = mean_squared_error(y_real, y_pred, squared=False) # RMSE and MAE are various on different scales
    mae = mean_absolute_error(y_real, y_pred)
    mape = mean_absolute_percentage_error(y_real, y_pred) # Note that dataset cannot have any 0 value.
    df_eval = pd.DataFrame({'Scale':scale, 'R2':r2, 'RMSE':rmse, 'MAE':mae, 'MAPE':mape}, index=[0])
    return df_eval

# 特征选择

In [3]:
# 划分测试集
df_all = df_raw.iloc[:,2:] # 删除编号
df_all.loc[df_all['E_RET']>0,'E_RET'] = 1
df_all.loc[df_all['E_RET']<=0,'E_RET'] = 0
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_all.iloc[:,1:], df_all.E_RET, test_size=0.1, random_state=0, shuffle=False) # 顺序分割
X_train.shape, X_test.shape

((339251, 72), (37695, 72))

In [4]:
# # 归一化
# from sklearn.preprocessing import MinMaxScaler # 为保证特征选择时，数据的一致性，进行归一化（针对方差选择等）
# scalarX = MinMaxScaler(feature_range=(0,1)) 
# X_train[X_train.columns] = scalarX.fit_transform(X_train)
# X_test[X_test.columns] = scalarX.transform(X_test)

In [5]:
# 挂载feature_selection包
sys.path.append('.\\pyFS\\')
from feature_selection import filter_method as ft

## 过滤式特征选择

In [33]:
# 1、方差特征选择 (标准化后方差为1, 无效)
# 作用：删除对大多数/所有观测值显示相同值的特征(常数/准常数特征)
from sklearn.feature_selection import VarianceThreshold 
var = VarianceThreshold(threshold=X_train.var().mean()) # 设置方差阈值 threshold = 所有特征方差的平均值
df = var.fit_transform(X_train)
selected_feature = var.get_feature_names_out() # 查看被留下特征的字符名称

In [26]:
# 2、卡方特征选择 （只是适用于离散特征）
# 作用：卡方检验主要是用来进行 分类变量（离散变量）的关联性、相关性分析
from sklearn.feature_selection import chi2

In [15]:
# 3、F检验特征选择 
# 作用：基于f检验的方法估计两个随机变量之间的线性依赖程度。只适用于非负性特征。
from sklearn.feature_selection import SelectKBest, f_classif # f_regression
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train, y_train)
selected_feature = selector.get_feature_names_out() # selector.scores_
selected_feature

array(['POA', 'ROE', 'ROIC', 'LTROC', 'ATURNOVER', 'RFR', 'TBR_1Y',
       'TBR_5Y', 'TBR_10Y', 'CPI'], dtype=object)

In [24]:
# 4、Pearson系数 特征选择 
# 作用：Pearson系数绝对值最大的10个特征
df_train = df_all[:-len(df_all)//10]
df_corr = df_train.corr()[["E_RET"]].sort_values(by=["E_RET"])
df_corr_abs = df_corr.abs().sort_values(by=["E_RET"])
selected_feature = df_corr.T[df_corr_abs[-11:].index].T[::-1][1:].index
selected_feature

Index(['TBR_5Y', 'CPI', 'TBR_1Y', 'ATURNOVER', 'TBR_10Y', 'LTROC', 'RFR',
       'ROIC', 'POA', 'ROE'],
      dtype='object')

In [33]:
# 5、Mutual information
# 作用：互信息度量一个特征的存在/缺失对正确预测Y的贡献。
from sklearn.feature_selection import SelectKBest, mutual_info_regression
selector = SelectKBest(mutual_info_regression, k=10)
selector.fit(X_train, y_train)
selected_feature = selector.get_feature_names_out()
selected_feature

array(['APE', 'ADY', 'ATURNOVER', 'AVAR', 'RFR', 'TBR_1Y', 'TBR_5Y',
       'TBR_10Y', 'TS', 'DRS'], dtype=object)

### 因果特征选择

In [None]:
sys.path.append('.\\pyCausalFS\\')
from pyCausalFS import CBD # 使用 pyCausalFS
from pyCausalFS.CBD.example.example_MB import * #  注意需要在CBD/example文件夹中的example_MB函数 添加return MB 并修改file.open的位置

def show_features(MB):
    df_col = pd.DataFrame(df_raw.iloc[:,2:].columns) # 存贮列名
    selected_feature = df_col.T[MB].values
    return selected_feature

df_train, df_test, y_train, y_test = train_test_split(df_all, df_all.E_RET, test_size=0.1, random_state=0, shuffle=False) # 顺序分割
df_train.columns = [x for x in range(len(df_train.columns))] # 去除列名
df_train.shape # df_train包含target列月超额收益率E_RET

In [34]:
# 1、IPCMB
MB = example(method='IPCMB', data=df_train, target=[0], alpha=0.01, is_discrete=False) # 选择第一列作为target
show_features(MB)
# 全集选择 ['OC', 'TBR_5Y', 'CPI', 'ATCDPS', 'DM', 'ERR', 'PE', 'TQ', 'CINT', 'EVM', 'TURNOVER', 'ST_REVERSAL']
# 训练集选择 收益率转01 ['OC', 'TBR_5Y', 'CPI', 'DM', 'ERR', 'PE', 'FATI', 'CINT', 'EVM', 'TURNOVER', 'BETA_S', 'MOM_3M', 'MOM_6M']
# 训练集选择 ['OC', 'TBR_5Y', 'CPI', 'DM', 'ERR', 'PE', 'FATI', 'CINT', 'EVM', 'TURNOVER']

Index(['OC', 'TBR_5Y', 'CPI', 'ATCDPS', 'DM', 'ERR', 'PE', 'TQ', 'CINT', 'EVM',
       'TURNOVER', 'ST_REVERSAL'],
      dtype='object')

In [None]:
# 2、PCMB
MB = example(method='PCMB', data=df_train, target=[0], alpha=0.01, is_discrete=False) # 选择第一列作为target
show_features(MB)

## 封装式特征选择

## 嵌入式特征选择

In [47]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', C=0.5, solver='sag')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
eval_class(y_test, y_pred) 
# all 0.505054	0.506271	0.624116	0.559051

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.505001,0.506229,0.624011,0.558983


In [48]:
lr_coef = pd.DataFrame(lr.coef_)
lr_coef.columns = df_all.columns[1:]
lr_coef = lr_coef.T.sort_values(by=[0])
lr_coef = lr_coef.abs().sort_values(by=[0])[-10:][::-1]
selected_feature = lr_coef.index
selected_feature

Index(['TBR_5Y', 'TBR_10Y', 'ADY', 'APE', 'RFR', 'TBR_1Y', 'LT_REVERSAL',
       'CPI', 'ATURNOVER', 'MOM_6M'],
      dtype='object')

In [48]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train) 
y_pred = dt.predict(X_test)
eval_class(y_test, y_pred) 
# all 0.94261917716486
# mi 0.730389

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.502799,0.505923,0.468813,0.486661


## 集成式特征选择

# 对比各模型分类表现

## 分类

In [10]:
# 记录cfs运行结果
# IPCMB MMMB IAMB inter_IAMB IAMBnPC GSMB BAMB MBOR STMB LRH/LCMB
def cfs_feature(cfs):
    if cfs=='IPCMB': return ['OC', 'TBR_5Y', 'CPI', 'DM', 'ERR', 'PE', 'FATI', 'CINT', 'EVM', 'TURNOVER', 'BETA_S', 'MOM_3M', 'MOM_6M'] # IPCMB 100m 
    if cfs=='MMMB': return ['OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'PE', 'MVTTA', 'FATI', 'CINT', 'EVM', 'TURNOVER', 'BETA_S', 'EBIT'] # MMMB 75m 
    if cfs=='IAMB': return ['ABM', 'OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S', 'ST_REVERSAL'] # IAMB 1m
    if cfs=='inter_IAMB': return ['TBR_5Y'] # inter_IAMB 8s
    # if cfs=='IAMBnPC': return ['DM', 'ERR', 'MVTTA', 'EVM', 'FATI', 'CINT', 'BETA_S', 'TBR_5Y', 'CPI'] # IAMBnPC 2m 
    if cfs=='GSMB': return ['ABM', 'OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'MVTTA', 'FATI', 'TQ', 'CINT', 'EVM', 'BETA_S', 'ST_REVERSAL'] # GSMB 1m
    if cfs=='BAMB': return ['TBR_5Y', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S'] # BAMB 6m 
    if cfs=='MBOR': return ['OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'PE', 'PS', 'PB', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'TURNOVER', 'BETA_S', 'EBIT', 'ST_REVERSAL'] # MBOR 128m
    if cfs=='STMB': return ['OC', 'TBR_5Y', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S', 'ST_REVERSAL'] # STMB 3m
    if cfs=='LCMB': return ['ABM', 'OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S', 'ST_REVERSAL'] # LRH 12m    
    if cfs=='TIE': return [] # TIE 17m   
    if cfs=='KIAMB': return ['ABM', 'OC', 'TBR_5Y', 'ALR', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S', 'ST_REVERSAL'] # KIAMB 1m  
    if cfs=='FBEDk': return ['TBR_5Y', 'CPI', 'DM', 'ERR', 'FATI', 'MVTTA', 'TQ', 'CINT', 'EVM', 'BETA_S'] # FBEDk 9s

In [5]:
selected_feature = cfs_feature(cfs='MBOR') # IPCMB MMMB IAMB inter_IAMB IAMBnPC GSMB BAMB MBOR STMB LCMB KIAMB FBEDk
# selected_feature = ['ST_REVERSAL']
X_train_selected, X_test_selected = X_train[selected_feature], X_test[selected_feature]
X_train_selected.head()

Unnamed: 0,OC,TBR_5Y,ALR,CPI,DM,ERR,PE,PS,PB,FATI,MVTTA,TQ,CINT,EVM,TURNOVER,BETA_S,EBIT,ST_REVERSAL
0,-0.106573,-0.793329,2.690755,0.547714,4.291879,0.311601,-0.138475,0.067215,0.37377,-0.171218,-1.005916,-0.684939,3.305135,0.227988,-0.762898,0.126015,0.864458,-0.57494
1,0.302482,-0.793329,0.574912,0.547714,0.719831,-0.207973,-0.300231,-0.448722,-0.500699,-0.553989,-0.613949,-0.478331,-0.308613,-0.285094,-0.473442,-0.502147,-0.025659,-0.073353
2,-0.058994,-0.793329,1.45228,0.547714,-0.34021,0.311601,-0.29067,-0.550378,-0.766247,-0.527272,-0.854049,-0.691054,-0.288026,0.071349,-0.133753,-0.104513,-0.14489,-0.60183
3,-0.019183,-0.793329,1.27949,0.547714,0.835539,0.311601,-0.210883,-0.189763,0.034569,-0.389905,-0.530074,-0.394632,-0.208324,-0.223464,-0.533321,1.131503,-0.102268,0.350307
4,-0.133537,-0.793329,-0.28982,0.547714,-0.489689,-2.05182,-0.269936,-0.13997,-0.368706,0.247612,-0.32043,-0.527791,-0.249045,-0.296841,-0.251305,-0.212079,-0.089145,-0.365854


### 逻辑回归

In [None]:
# 召回率（Recall）是衡量分类模型对正类（关注的类）样本识别能力的指标。它是在所有实际为正类的样本中，被模型正确识别为正类的样本比例。

In [6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', C=0.5, solver='sag')
lr.fit(X_train_selected, y_train)
y_pred = lr.predict(X_test_selected)
df_test = pd.DataFrame(y_test)
df_test['pred'] = y_pred
eval_class(y_test, y_pred) 
# all 0.505001	0.506229	0.624011	0.558983
# cfs-y01 0.507017	0.505218	0.937573	0.656614 
# cfs-yc 0.507335	0.505323	0.949235	0.659541
# MMMB 0.497586	0.492696	0.861426	0.626858
# IPCMB(0.3test) 0.495083	0.491401	0.87574	0.629546
# f1 0.396976	0.375682	0.301478	0.334514
# pearson 0.396843	0.375477	0.301214	0.33427
# mi 0.5128	0.511796	0.66971	0.5802
# lr 0.482982	0.489077	0.636781	0.55324

# IPCMB 0.507335	0.505323	0.949235	0.659541
# MMMB 0.508211	0.505858	0.938681	0.657427
# IAMB 0.505319	0.504271	0.944011	0.657382
# inter_IAMB 0.502719	0.502719	1.0	0.669079
# IAMBnPC 0.501048	0.501997	0.941636	0.654874
# GSMB 0.505292	0.504257	0.943958	0.657357
# BAMB 0.500809	0.501879	0.937467	0.653762
# MBOR 0.510359	0.507055	0.934881	0.6575
# STMB 0.505611	0.504437	0.942005	0.657036
# LRH 0.505292	0.504257	0.943958	0.657357

# ST_REVERSAL 0	0.547685	0.574323	0.387388	0.462688
# ST_REVERSAL 0	0.547685	0.574323	0.387388	0.462688

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.510359,0.507055,0.934881,0.6575


### KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()   
knn.fit(X_train_selected, y_train) 
y_pred = knn.predict(X_test_selected)
eval_class(y_test, y_pred) 
# cfs 0.521528	0.518766	0.666649	0.583483
# cfs-y01 0.509564	0.509283	0.670185	0.578759
# cfs-yc 0.530707	0.526731	0.655092	0.583941
# f1  0.489534	0.494814	0.735145	0.5915
# pearson 0.489534	0.494814	0.735145	0.5915
# mi 0.48407	0.490963	0.713879	0.581799
# lr 0.506433	0.505805	0.79314	0.617692

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.507163,0.497473,0.589856,0.53974


### 朴素贝叶斯

In [14]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train_selected, y_train) 
y_pred = clf.predict(X_test_selected)
eval_class(y_test, y_pred) 
# cfs 0.515029	0.510163	0.886121	0.647527
# cfs-y01 0.510041	0.507276	0.884855	0.644861
# cfs-yc 0.510121	0.507153	0.905488	0.650159
# f1 0.485608	0.493917	0.942639	0.648197
# pearson 0.485608	0.493917	0.942639	0.648197
# mi 0.51601	0.512812	0.745594	0.607673
# lr 0.475819	0.488803	0.931821	0.641235

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.514684,0.510178,0.867599,0.642528


### 决策树

In [13]:
selected_feature = cfs_feature(cfs='FBEDk') # IPCMB MMMB IAMB inter_IAMB IAMBnPC GSMB BAMB MBOR STMB LCMB KIAMB FBEDk
X_train_selected, X_test_selected = X_train[selected_feature], X_test[selected_feature]

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_selected, y_train) # X_train
y_pred = clf.predict(X_test_selected) # X_test 
eval_class(y_test, y_pred) 
# cfs 0.55835	0.544824	0.738259	0.626961
# cfs-y01 0	0.572914	0.55842	0.71905	0.628636
# cfs-yc 0.564452	0.54902	0.748232	0.63333
# f1 0.50126	0.503813	0.522955	0.513206
# pr 0.496432	0.499149	0.49504	    0.497086
# mi 0.532829	0.534737	0.544274	0.539463
# lr 0.505929	0.507494	0.582533	0.54243

# all 0.503701	0.506773	0.477731	0.491824
# MBOR 0.567051	0.570155	0.552127	0.767704	0.64231
# LRH 0.485184	0.489615	0.56723	0.525572
# STMB 0.557872	0.54424	0.741372	0.627692
# KIAMB 0.479904	0.485012	0.559261	0.519497
# FBEDk 0.551001	0.539723	0.725963	0.619141

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.551001,0.539723,0.725963,0.619141


### SVM

In [42]:
from sklearn.svm import SVC
import optuna

# def objective(trial):
#     # kernel = trial.suggest_categorical('kernel', ['linear','rbf','poly','sigmoid'])
#     gamma = trial.suggest_loguniform('gamma',1e-5,1e5)
#     C = trial.suggest_loguniform('C',1e-5,1e5)
#     epsilon = trial.suggest_loguniform('epsilon',1e-5,1e5)
#     model = SVC(kernel='rbf', gamma=gamma, C=C, epsilon=epsilon).fit(x_train, y_train)
#     score = cross_val_score(model, x_train, y_train, cv=5, scoring=accuracy_score)
#     return score.mean() 
# study = optuna.create_study(study_name='SVR C gamma epsilon', direction='maximize') # TPESampler is used
# # optuna.logging.set_verbosity(optuna.logging.WARNING) # not to print
# study.optimize(objective, n_trials=100, n_jobs=-1, gc_after_trial=True)  # number of iterations
# # best_kernel = study.best_params['kernel']
# best_gamma = study.best_params['gamma']
# best_C = study.best_params['C']
# best_epsilon = study.best_params['epsilon']

# Predict
# model = SVC(kernel='rbf', gamma=best_gamma, C=best_C, epsilon=best_epsilon)
# model = SVC(kernel='rbf', gamma=0.01, C=100)
model = SVC(kernel='linear')
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
eval_class(y_test, y_pred) 
# all 

### GBDT

In [None]:
# GBDT调参
# 设置迭代次数的范围
param_test1 = {'n_estimators': range(20, 251, 10)}
estimator = GradientBoostingClassifier(learning_rate=0.2, min_samples_split=50, min_samples_leaf=5, max_depth=8,
                                       max_features='sqrt', subsample=0.8, random_state=10)
gsearch1 = GridSearchCV(estimator, param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import tree 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import optuna
def objective(trial):
    param_grid = {
        'metric': 'rmse', 
        'random_state': 48,
        'n_estimators': 20000,
        'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5]),
        'num_leaves': trial.suggest_categorical('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0]),
    }
    model = GradientBoostingClassifier(**param_grid)
    model.fit(X_train_selected, y_train, eval_set=[(X_test_selected, y_test)], early_stopping_rounds=100, verbose=False)
    score = cross_val_score(model, X_train_selected, y_train, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
                            
    return score.mean() 
study = optuna.create_study(study_name='GBDT Classifier hyperparameters', direction='maximize') # TPESampler is used
optuna.logging.set_verbosity(optuna.logging.WARNING) # not to print
study.optimize(objective, n_trials=100, n_jobs=-1, gc_after_trial=True)  # number of iterations
best_params = study.best_params

# Predict
gbdt = GradientBoostingClassifier(random_state=110)
gbdt.fit(X_train_selected, y_train)
y_pred = gbdt.predict(X_test_selected)
eval_class(y_pred, y_test)

model = lgb.LGBMRegressor(**best_params)
model.fit(X_train_selected, y_train, eval_set=[(X_test_selected, y_test)], early_stopping_rounds=100, verbose=False)
y_pred = model.predict(X_test_selected)
eval_class(y_pred, y_test)

#### GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

def GRU_model(trainset_shape):# Build GRU model
    model = Sequential()
    model.add(GRU(128, input_shape=((1,72)), activation='softmax', return_sequences=True)) # input_shape=((timestep, features))
    model.add(Dropout(0.2))
    model.add(GRU(64,activation='softmax',return_sequences=True))
    model.add(Dropout(0.2))
    model.add(GRU(32,activation='softmax',return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

Epochs = 100
predict_duration = 100

train_X =  X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])) # Convert to tensor 
test_X = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))# Convert to tensor 

model = GRU_model(train_X.shape) # Build the model # Use model.summary() to show the model structure
patience = Epochs//10 
EarlyStop = EarlyStopping(monitor='val_loss', patience=5*patience, verbose=0, mode='auto') # Early stop at small learning rate
Reduce = ReduceLROnPlateau(monitor='val_loss', patience=patience, verbose=0, mode='auto') # Adaptive learning rate
history = model.fit(train_X, y_train, epochs=Epochs, batch_size=16, validation_split=0.1, verbose=1, shuffle=True, callbacks=[EarlyStop,Reduce]) # Train the model

y_pred = model.predict(X_test) # Predict
eval_class(y_test, y_pred) # Evaluate model

## 预测

In [49]:
selected_feature

Index(['TBR_5Y', 'TBR_10Y', 'ADY', 'APE', 'RFR', 'TBR_1Y', 'LT_REVERSAL',
       'CPI', 'ATURNOVER', 'MOM_6M'],
      dtype='object')

In [50]:
# 划分测试集
df_all = df_raw.iloc[:,2:] # 删除编号
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_all.iloc[:,1:], df_all.E_RET, test_size=0.1, random_state=0, shuffle=False)

# 归一化
from sklearn.preprocessing import MinMaxScaler # 为保证特征选择时，数据的一致性，进行归一化（针对方差选择等）
scalarX = MinMaxScaler(feature_range=(0,1)) 
X_train[X_train.columns] = scalarX.fit_transform(X_train)
X_test[X_test.columns] = scalarX.transform(X_test)
scalarY = MinMaxScaler(feature_range=(0,1)) 
y_train, y_test = y_train.to_frame(), y_test.to_frame()
y_train[y_train.columns] = scalarY.fit_transform(y_train)
y_test[y_test.columns] = scalarY.transform(y_test)
X_train.shape, X_test.shape

((339251, 72), (37695, 72))

In [13]:
X_train_selected = X_train[selected_feature]
X_test_selected = X_test[selected_feature]
X_train_selected.head()

Unnamed: 0,TBR_5Y,RG,APE,LT_REVERSAL,ADY,TURNOVER,RFR,TBR_10Y,CINT,ROA
151011,0.534817,0.02369,0.130552,0.284517,0.577787,0.041633,0.757655,0.293234,0.001122,0.4833
183939,0.293179,0.031255,0.158738,0.404748,0.569578,0.111403,0.342511,0.072934,0.010355,0.499544
52013,0.250478,0.028793,0.486768,0.393565,0.361647,0.052976,0.035129,0.250473,0.002883,0.486869
230655,0.617935,0.02445,0.115392,0.273371,0.622685,0.046,0.600522,0.383359,0.009035,0.469495
274364,0.458366,0.025456,0.090385,0.369823,0.72565,0.035918,0.324235,0.215093,0.001214,0.613144


In [52]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import cross_val_score, GridSearchCV
import optuna

def objective(trial):
    alpha = trial.suggest_float('alpha', 0, 1) 
    model = Lasso(alpha=alpha).fit(X_train, y_train)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return score.mean() 
study = optuna.create_study(study_name='LASSO alpha', direction='maximize') # TPESampler is used
optuna.logging.set_verbosity(optuna.logging.WARNING) # not to print
study.optimize(objective, n_trials=100, n_jobs=-1, gc_after_trial=True)  # number of iterations
best_alpha = study.best_params['alpha']

model = Lasso(alpha=best_alpha)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
eval_result(y_test, y_pred) 
# all 3.797127	0.035358	0.142767	0.10041	2.114145
# var 3.797127	-1.441297e-07	0.145359	0.102538	1.84601

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2024-06-21 22:11:15,282][0m A new study created in memory with name: LASSO alpha[0m


Unnamed: 0,Scale,R2,RMSE,MAE,MAPE
0,0.667071,-0.001459,0.030922,0.022084,0.152329


In [54]:
from sklearn.linear_model import Ridge

# 创建岭回归模型实例
ridge = Ridge(alpha=1.0)

def objective(trial):
    alpha = trial.suggest_float('alpha', 0, 1) 
    model = Lasso(alpha=alpha).fit(X_train, y_train)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return score.mean() 
study = optuna.create_study(study_name='Ridge alpha', direction='maximize') # TPESampler is used
optuna.logging.set_verbosity(optuna.logging.WARNING) # not to print
study.optimize(objective, n_trials=100, n_jobs=-1, gc_after_trial=True)  # number of iterations
best_alpha = study.best_params['alpha']

model = Ridge(alpha=best_alpha)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
eval_result(y_test, y_pred) 
# all 3.797127	0.045885	0.141985	0.099644	2.411082
# lr 0.796758	0.045913	0.029793	0.020908	0.143407



Unnamed: 0,Scale,R2,RMSE,MAE,MAPE
0,0.667071,-0.035826,0.031448,0.022623,0.158407


In [56]:
c = pd.DataFrame(model.coef_)
c.columns = df_all.columns[1:]
c = c.T.sort_values(by=[0])
c = c.abs().sort_values(by=[0])[-10:][::-1]
a = c.index
a

Index(['TBR_5Y', 'RG', 'ADY', 'ATURNOVER', 'CINT', 'ITAX', 'TBR_10Y', 'APE',
       'FATI', 'CATI'],
      dtype='object')

#### 神经网络环境检测

In [15]:
# Check tensorflow GPU CUDA
import tensorflow as tf
print(tf.test.gpu_device_name())
print('GPU', tf.config.list_physical_devices('GPU'))
print('CUDA', tf.test.is_built_with_cuda())

/device:GPU:0
GPU [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
CUDA True


#### GRU

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

def GRU_model(trainset_shape):# Build GRU model
    model = Sequential()
    model.add(GRU(128, input_shape=((1,72)), activation='tanh', return_sequences=True)) # input_shape=((timestep, features))
    model.add(Dropout(0.2))
    model.add(GRU(64,activation='tanh',return_sequences=True))
    model.add(Dropout(0.2))
    model.add(GRU(32,activation='tanh',return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='tanh'))
    model.compile(loss='mse', optimizer='adam')
    return model

Epochs = 100
predict_duration = 100

train_X =  X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])) # Convert to tensor 
test_X = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))# Convert to tensor 

model = GRU_model(train_X.shape) # Build the model # Use model.summary() to show the model structure
patience = Epochs//10 
EarlyStop = EarlyStopping(monitor='val_loss', patience=5*patience, verbose=0, mode='auto') # Early stop at small learning rate
Reduce = ReduceLROnPlateau(monitor='val_loss', patience=patience, verbose=0, mode='auto') # Adaptive learning rate
history = model.fit(train_X, y_train, epochs=Epochs, batch_size=16, validation_split=0.1, verbose=1, shuffle=True, callbacks=[EarlyStop,Reduce]) # Train the model

y_pred = model.predict(X_test) # Predict
eval_result(y_test, y_pred) # Evaluate model

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

ValueError: in user code:

    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Software\Anaconda\envs\GPU\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 1, 72), found shape=(None, 72)
