In [1]:
import os
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, rand, tpe, space_eval,Trials,STATUS_OK
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
factors = ['Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate', 'NetProfitGrowRate5Y', 'TVSTD20',
           'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV', 'MLEV', 'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC', 'GREC',
           'DASREV', 'SFY12P', 'LCAP', 'ASSI', 'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB', 'PS', 'SalesCostRatio', 'PCF', 'CETOP',
           'TotalProfitGrowRate', 'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI', 'PSY', 'BIAS10', 'ROE', 'ROA', 'ROA5', 'ROE5',
           'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut', 'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6',
           'REVS5', 'REVS10', 'REVS20', 'REVS60', 'HSIGMA', 'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility', 'Aroon',
           'DDI', 'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5', 'VOL60', 'RealizedVolatility', 'DASTD', 'DDNSR', 'Hurst']

df = pd.read_csv('dataset_factorRank10.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ticker,tradeDate,Beta60,OperatingRevenueGrowRate,NetProfitGrowRate,NetCashFlowGrowRate,NetProfitGrowRate5Y,TVSTD20,...,RealizedVolatility,DASTD,DDNSR,Hurst,next_month_end,abs_return,active_return,industryName1,label,year
0,0,0,1,20070131,9,8,1,8,6,3,...,10,8,10,10,20070228,-0.004221,-0.071021,银行,-1,2007
1,1,1,2,20070131,3,3,5,5,9,2,...,10,3,9,10,20070228,-0.037359,-0.104159,房地产,-1,2007
2,2,2,60,20070131,1,8,1,4,1,1,...,10,2,3,10,20070228,0.171481,0.104681,有色金属,0,2007
3,3,3,63,20070131,7,9,9,1,5,1,...,10,8,8,10,20070228,0.093903,0.027103,通信,0,2007
4,4,4,69,20070131,9,5,9,4,3,1,...,10,3,3,10,20070228,0.020656,-0.046144,房地产,-1,2007


In [3]:
i = 1 #第一个训练集 
tdate = np.unique(df['tradeDate'].values)# 共计130个月
mark1 = tdate[72*i]
mark2 = tdate[72*i +3]
train_df = df[df['tradeDate']<mark1]
train_df = train_df[train_df['label']!=0]
train_df.loc[:,'label'] = train_df.loc[:,'label'].apply(lambda x:0 if x ==-1 else x)
test_df = df[(df['tradeDate']>mark1) & (df['tradeDate']<mark2)]
test_df = test_df[test_df['label']!=0]
test_df.loc[:,'label'] = test_df.loc[:,'label'].apply(lambda x:0 if x ==-1 else x)
# np.unique(test['tradeDate'].values) 测试过train和test的月份个数分别是72，2

train_x = train_df[factors]
train_y = train_df['label']
test_x = test_df[factors]
test_y = test_df['label']

dev_df = df[df['tradeDate']==mark2]
dev_x = dev_df[factors]

In [4]:
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
def my_custom_score(ground_truth,prediction):
    return roc_auc_score(ground_truth,prediction)

my_score = make_scorer(my_custom_score,greater_is_better =True)

In [91]:
tdate[74]

20130329

In [5]:
def obj(params,x=train_x,y=train_y):
    xgb = XGBClassifier(max_depth = params['max_depth'],learning_rate = params['learning_rate'],\
                       n_estimators = params['n_estimators'],min_child_weight = params['min_child_weight'], \
                       gamma = params['gamma'],subsample = params['subsample'],\
                       colsample_bytree = params['colsample_bytree'],reg_alpha = params['reg_alpha'],\
                       reg_lambda = params['reg_lambda'],n_jobs = -1,verbose = 0)
    scores = cross_val_score(xgb,x,y,cv = 3,scoring = my_score )
    
    print('params:',params)
    print('scores:',scores)
    print('scores_average:',np.average(scores))
    
    return {'loss': -np.average(scores),'status':STATUS_OK}

space = {
    'max_depth': hp.choice('max_depth',list(range(2,15))),
    'learning_rate':hp.choice('learning_rate',list(np.arange(0.01,0.5,10))),
    'n_estimators':hp.choice('n_estimators',list(range(30,500))),
    'min_child_weight':hp.choice('min_child_weight',list(range(2,20))), #建立每个模型所需要的最小样本数
    'gamma':hp.choice('gamma',list(np.arange(0.01,1,0.05))),                       #进一步划分所需的最小损失减少
    'subsample':hp.choice('subsample',list(np.arange(0.5,0.9,0.05))),                #用于训练模型的子样本占整个样本集合的比例
    'colsample_bytree':hp.choice('colsample_bytree',list(np.arange(0.2,0.9,0.05))), #在建立树时对特征采样的比例
    'reg_alpha':hp.choice('reg_alpha',list(np.arange(0.1,1,0.05))),   #L1 正则的惩罚系数
    'reg_lambda':hp.choice('reg_lambda',list(np.arange(0.1,1,0.05))) #L2 正则的惩罚系数
}
trials = Trials()
best = fmin(obj,space,algo=tpe.suggest,max_evals = 100,trials = trials)
print(space_eval(space,best))

params: {'colsample_bytree': 0.84999999999999987, 'gamma': 0.96000000000000008, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 17, 'n_estimators': 263, 'reg_alpha': 0.25000000000000006, 'reg_lambda': 0.35000000000000009, 'subsample': 0.60000000000000009}
scores: [ 0.5504058   0.5667608   0.54676189]
scores_average: 0.554642830872
params: {'colsample_bytree': 0.34999999999999998, 'gamma': 0.51000000000000001, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 4, 'n_estimators': 232, 'reg_alpha': 0.80000000000000016, 'reg_lambda': 0.15000000000000002, 'subsample': 0.55000000000000004}
scores: [ 0.54980637  0.55883552  0.54180572]
scores_average: 0.550149202378
params: {'colsample_bytree': 0.54999999999999993, 'gamma': 0.56000000000000005, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 6, 'n_estimators': 362, 'reg_alpha': 0.95000000000000029, 'reg_lambda': 0.50000000000000011, 'subsample': 0.80000000000000027}
scores: [ 0.54686623  0.5652185   0.54322165]


params: {'colsample_bytree': 0.84999999999999987, 'gamma': 0.81000000000000005, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 7, 'n_estimators': 110, 'reg_alpha': 0.75000000000000022, 'reg_lambda': 0.80000000000000016, 'subsample': 0.60000000000000009}
scores: [ 0.54733481  0.55588766  0.54238823]
scores_average: 0.548536898924
params: {'colsample_bytree': 0.34999999999999998, 'gamma': 0.46000000000000002, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 263, 'reg_alpha': 0.55000000000000016, 'reg_lambda': 0.6000000000000002, 'subsample': 0.65000000000000013}
scores: [ 0.54851913  0.56416596  0.5388515 ]
scores_average: 0.55051219632
params: {'colsample_bytree': 0.29999999999999999, 'gamma': 0.71000000000000008, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 11, 'n_estimators': 449, 'reg_alpha': 0.25000000000000006, 'reg_lambda': 0.35000000000000009, 'subsample': 0.80000000000000027}
scores: [ 0.55122631  0.56841609  0.54003392]
sc

params: {'colsample_bytree': 0.74999999999999978, 'gamma': 0.76000000000000001, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 16, 'n_estimators': 52, 'reg_alpha': 0.80000000000000016, 'reg_lambda': 0.20000000000000004, 'subsample': 0.75000000000000022}
scores: [ 0.54946865  0.55872856  0.5420327 ]
scores_average: 0.550076635952
params: {'colsample_bytree': 0.64999999999999991, 'gamma': 0.31000000000000005, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 17, 'n_estimators': 120, 'reg_alpha': 0.95000000000000029, 'reg_lambda': 0.55000000000000016, 'subsample': 0.80000000000000027}
scores: [ 0.55124154  0.56547292  0.54191937]
scores_average: 0.552877943322
params: {'colsample_bytree': 0.20000000000000001, 'gamma': 0.6100000000000001, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 441, 'reg_alpha': 0.70000000000000018, 'reg_lambda': 0.10000000000000001, 'subsample': 0.65000000000000013}
scores: [ 0.55145993  0.56723066  0.54156351]
sc

params: {'colsample_bytree': 0.59999999999999987, 'gamma': 0.26000000000000001, 'learning_rate': 0.01, 'max_depth': 13, 'min_child_weight': 4, 'n_estimators': 128, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.70000000000000018, 'subsample': 0.55000000000000004}
scores: [ 0.54733414  0.55364384  0.54192305]
scores_average: 0.54763367697
params: {'colsample_bytree': 0.39999999999999997, 'gamma': 0.91000000000000003, 'learning_rate': 0.01, 'max_depth': 11, 'min_child_weight': 6, 'n_estimators': 31, 'reg_alpha': 0.95000000000000029, 'reg_lambda': 0.35000000000000009, 'subsample': 0.70000000000000018}
scores: [ 0.54839797  0.5480993   0.53837763]
scores_average: 0.544958298302
params: {'colsample_bytree': 0.84999999999999987, 'gamma': 0.11, 'learning_rate': 0.01, 'max_depth': 14, 'min_child_weight': 13, 'n_estimators': 456, 'reg_alpha': 0.25000000000000006, 'reg_lambda': 0.20000000000000004, 'subsample': 0.75000000000000022}
scores: [ 0.54780606  0.561309    0.54368933]
scores_average:

In [80]:
 xgb = XGBClassifier(max_depth = 11,
                    learning_rate = 0.01,
                    n_estimators = 246 ,
                    min_child_weight = 17, 
                    gamma = 0.66 ,
                    subsample =0.60,
                    colsample_bytree = 0.69,
                    reg_alpha = 0.45,
                    reg_lambda = 0.75,
                    n_jobs = 10,
                    verbose = 0)
xgb.fit(train_x,train_y)
predict_y1 = xgb.predict_proba(test_x)

In [81]:
predict_y1

array([[ 0.52221131,  0.47778872],
       [ 0.52725506,  0.47274497],
       [ 0.5229578 ,  0.47704223],
       ..., 
       [ 0.4973684 ,  0.5026316 ],
       [ 0.49224454,  0.50775546],
       [ 0.49844652,  0.50155348]], dtype=float32)

In [None]:
#-------------------网格调参，不要运行------------------------
parameters = {
    'max_depth': ([2,3,4,5,6,7,8,9,10]),
    'learning_rate':[0.01,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5],
    'n_estimators':[50,100,150,200,250,300,350,400,500],
    'min_child_weight':[2,4,6,8,10,12,14,16,18,20], #建立每个模型所需要的最小样本数
    'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],                       #进一步划分所需的最小损失减少
    'subsample':[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9],                #用于训练模型的子样本占整个样本集合的比例
    'colsample_bytree':[0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9], #在建立树时对特征采样的比例
    'reg_alpha':[0.1,0.2,0.25,0.3,0.35,0.4,0.45,0.5],   #L1 正则的惩罚系数
    'reg_lambda':[0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5]   #L2 正则的惩罚系数
     }
xgb=XGBClassifier()
xgb.fit(train_x, train_y, eval_metric='rmse', verbose = False, eval_set = [(train_x, train_y)])
clf = GridSearchCV(xgb,parameters,n_jobs=10,cv=5)
clf.fit(train_x, train_y)
print(clf.best_score_,clf.best_params_)

In [5]:
# lr1调参
i = 1 #第一个训练集 
tdate = np.unique(df['tradeDate'].values)# 共计130个月
mark1 = tdate[6*i]
mark2 = tdate[6*i +3]
train_df_lr = df[df['tradeDate']<mark1]
train_df_lr = train_df_lr[train_df_lr['label']!=0]
train_df_lr.loc[:,'label'] = train_df_lr.loc[:,'label'].apply(lambda x:0 if x ==-1 else x)
test_df_lr = df[(df['tradeDate']>mark1) & (df['tradeDate']<mark2)]
# np.unique(test['tradeDate'].values) 测试过train和test的月份个数分别是72，2

train_x_lr = train_df_lr[factors]
train_y_lr = train_df_lr['label']
test_x_lr = test_df_lr[factors]
test_y_lr = test_df_lr['label']

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
train_x_lr = np.array(train_x_lr)
train_y_lr = np.array(train_y_lr)
parameters = {
    'C':[0.01,0.05,0.1,0.5,1,5,10],
    'max_iter':[50,100,150,200,250,300,350,400],
    'fit_intercept':['True','False']
}
lr = LogisticRegression(penalty = 'l2',verbose = 0)
lr.fit(train_x_lr, train_y_lr)
clf = GridSearchCV(lr,parameters,n_jobs=10,cv=5)
clf.fit(train_x_lr, train_y_lr)
print(clf.best_score_,clf.best_params_)

0.533606078317 {'C': 0.01, 'fit_intercept': 'True', 'max_iter': 50}


In [82]:
lr1 = LogisticRegression(C= 0.01,max_iter = 50,fit_intercept = True,penalty = 'l2',verbose = 0)
lr1.fit(train_x,train_y)
predict_y2 = lr.predict_proba(test_x)

In [83]:
len(predict_y2)

860

In [84]:
a = np.array(predict_y1)
b = np.array(predict_y2)
new_feature1 = pd.DataFrame()
new_feature2 = pd.DataFrame()
cols = ['f1','f2','f3','f4']
new_feature1['f1']= a[:,0]
new_feature1['f2']= a[:,1]
new_feature1['f3']= b[:,0]
new_feature1['f4']= b[:,1]

new_feature2['f1'] = a[:,1]
new_feature2['f2'] = b[:,1]
new_feature2.head()

Unnamed: 0,f1,f2
0,0.477789,0.45157
1,0.472745,0.505126
2,0.477042,0.313791
3,0.490025,0.510992
4,0.478876,0.42354


In [85]:
# lr2调参
lr2 = LogisticRegression()
lr2.fit(new_feature2,test_y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
# dev data 
def predict(dev_x):
    predict_y1 = xgb.predict_proba(dev_x)[:,1]
    predict_y2 = lr1.predict_proba(dev_x)[:,1]
    
    new_feature = pd.DataFrame()
    new_feature['f1'] = predict_y1
    new_feature['f2'] = predict_y2
    
    predict_score = lr2.predict_proba(new_feature)
    return predict_score

In [87]:
result = predict(dev_x)

In [89]:
result[:,1]

array([ 0.47517612,  0.53461114,  0.52767057,  0.48745222,  0.50056924,
        0.42473712,  0.52522845,  0.49328971,  0.48611874,  0.54154374,
        0.4946168 ,  0.50678512,  0.50994976,  0.46650545,  0.50765167,
        0.5048402 ,  0.44758615,  0.51213006,  0.49710364,  0.50997303,
        0.48860835,  0.49498495,  0.47524904,  0.53340668,  0.43040135,
        0.49967394,  0.51522342,  0.49193099,  0.48314706,  0.54834358,
        0.47991965,  0.54310969,  0.51114806,  0.52507374,  0.49426888,
        0.50473068,  0.46981217,  0.47782004,  0.51766526,  0.46738239,
        0.55624665,  0.50071116,  0.51775951,  0.46962674,  0.44307734,
        0.45690918,  0.52087248,  0.4847255 ,  0.53956572,  0.47241284,
        0.50897066,  0.49518924,  0.48767321,  0.54604298,  0.47811958,
        0.52596785,  0.53853402,  0.46156343,  0.43867204,  0.47877726,
        0.44881797,  0.49367138,  0.50399101,  0.48965417,  0.47023435,
        0.47955863,  0.49181336,  0.53912244,  0.51861896,  0.52