In [1]:
import os
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, rand, tpe, space_eval,Trials,STATUS_OK
from sklearn.model_selection import cross_val_score
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
factors = ['Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate', 'NetProfitGrowRate5Y', 'TVSTD20',
           'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV', 'MLEV', 'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC', 'GREC',
           'DASREV', 'SFY12P', 'LCAP', 'ASSI', 'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB', 'PS', 'SalesCostRatio', 'PCF', 'CETOP',
           'TotalProfitGrowRate', 'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI', 'PSY', 'BIAS10', 'ROE', 'ROA', 'ROA5', 'ROE5',
           'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut', 'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6',
           'REVS5', 'REVS10', 'REVS20', 'REVS60', 'HSIGMA', 'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility', 'Aroon',
           'DDI', 'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5', 'VOL60', 'RealizedVolatility', 'DASTD', 'DDNSR', 'Hurst']

df = pd.read_csv('dataset_factorRank10.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ticker,tradeDate,Beta60,OperatingRevenueGrowRate,NetProfitGrowRate,NetCashFlowGrowRate,NetProfitGrowRate5Y,TVSTD20,...,RealizedVolatility,DASTD,DDNSR,Hurst,next_month_end,abs_return,active_return,industryName1,label,year
0,0,0,1,20070131,9,8,1,8,6,3,...,10,8,10,10,20070228,-0.004221,-0.071021,银行,-1,2007
1,1,1,2,20070131,3,3,5,5,9,2,...,10,3,9,10,20070228,-0.037359,-0.104159,房地产,-1,2007
2,2,2,60,20070131,1,8,1,4,1,1,...,10,2,3,10,20070228,0.171481,0.104681,有色金属,0,2007
3,3,3,63,20070131,7,9,9,1,5,1,...,10,8,8,10,20070228,0.093903,0.027103,通信,0,2007
4,4,4,69,20070131,9,5,9,4,3,1,...,10,3,3,10,20070228,0.020656,-0.046144,房地产,-1,2007


In [3]:
i = 1 #第一个训练集 
tdate = np.unique(df['tradeDate'].values)# 共计130个月
mark1 = tdate[72*i]
mark2 = tdate[72*i +3]
train_df = df[df['tradeDate']<mark1]
test_df = df[(df['tradeDate']>mark1) & (df['tradeDate']<mark2)]
# np.unique(test['tradeDate'].values) 测试过train和test的月份个数分别是72，2

train_x = train_df[factors]
train_y = train_df['active_return']
test_x = test_df[factors]
test_y = test_df['active_return']

dev_df = df[df['tradeDate']==mark2]
dev_x = dev_df[factors]

In [4]:
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
def my_custom_score(ground_truth,prediction):
    return r2_score(ground_truth,prediction)

my_score = make_scorer(my_custom_score,greater_is_better =True)

In [5]:
def obj(params,x=train_x,y=train_y):
    lgb = LGBMRegressor(max_depth = params['max_depth'],
                         learning_rate = params['learning_rate'],
                         n_estimators = params['n_estimators'],
                         min_child_weight = params['min_child_weight'],
                         gamma = params['gamma'],
                         subsample = params['subsample'],
                         colsample_bytree = params['colsample_bytree'],
                         reg_alpha = params['reg_alpha'],
                         #reg_lambda = params['reg_lambda'],
                         n_jobs = -1,
                         verbose = 1,
                         min_gain_to_split = params['min_gain_to_split'])
    lgb.fit(x,y,eval_metric=['l2'], verbose =0, eval_set = [(test_x, test_y)],early_stopping_rounds=100)

    print ('best score:',lgb.best_score_)
    score = lgb.best_score_['valid_0']['l2']
    return {'loss': score,'status':STATUS_OK}

space = {
    'max_depth': hp.choice('max_depth',list(range(2,15))),
    'learning_rate':hp.choice('learning_rate',list(np.arange(0.01,0.5,1))),
    'n_estimators':hp.choice('n_estimators',list(range(30,500))),
    'min_child_weight':hp.choice('min_child_weight',list(range(2,20))), #建立每个模型所需要的最小样本数
    'gamma':hp.choice('gamma',list(np.arange(0.01,1,0.05))),                       #进一步划分所需的最小损失减少
    'subsample':hp.choice('subsample',list(np.arange(0.5,0.9,0.05))),                #用于训练模型的子样本占整个样本集合的比例
    'colsample_bytree':hp.choice('colsample_bytree',list(np.arange(0.2,0.9,0.05))), #在建立树时对特征采样的比例
    'reg_alpha':hp.choice('reg_alpha',list(np.arange(0.1,1,0.1))),   #L1 正则的惩罚系数
    #'reg_lambda':hp.choice('reg_lambda',list(np.arange(0.1,1,0.5))), #L2 正则的惩罚系数
    'min_gain_to_split':hp.choice('reg_lambda',list(np.arange(0.1,1,0.1)))
}
trials = Trials()
best = fmin(obj,space,algo=tpe.suggest,max_evals = 100,trials = trials)
print(space_eval(space,best))

best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099197193263482857}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099319510627835991}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099338444023068066}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099257192385117949}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099354576867013138}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099269236946804302}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099249074986417947}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099257370254331283}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099157051073948746}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099300757031663064}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099351520446679161}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.009933588775820

best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.009915720757628985}})
best score: defaultdict(<class 'dict'>, {'valid_0': {'l2': 0.0099334199745929989}})
{'colsample_bytree': 0.49999999999999994, 'gamma': 0.11, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'min_gain_to_split': 0.10000000000000001, 'n_estimators': 266, 'reg_alpha': 0.30000000000000004, 'subsample': 0.75000000000000022}


In [None]:
##二分类最优参数
{'colsample_bytree': 0.49999999999999994, 
    'gamma': 0.36000000000000004, 'learning_rate': 0.01, 'max_depth': 7,
                'min_child_weight': 14, 'min_gain_to_split': 0.90000000000000002, 
                        'n_estimators': 425, 'reg_alpha': 0.80000000000000004, 'subsample': 0.55000000000000004}

In [None]:
# 回归最优参数
{'colsample_bytree': 0.49999999999999994, 
    'gamma': 0.11, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 4,
                    'min_gain_to_split': 0.10000000000000001, 
                        'n_estimators': 266, 'reg_alpha': 0.30000000000000004, 'subsample': 0.75000000000000022}