In [7]:
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
factors = ['Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate', 'NetProfitGrowRate5Y', 'TVSTD20',
           'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV', 'MLEV', 'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC', 'GREC',
           'DASREV', 'SFY12P', 'LCAP', 'ASSI', 'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB', 'PS', 'SalesCostRatio', 'PCF', 'CETOP',
           'TotalProfitGrowRate', 'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI', 'PSY', 'BIAS10', 'ROE', 'ROA', 'ROA5', 'ROE5',
           'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut', 'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6',
           'REVS5', 'REVS10', 'REVS20', 'REVS60', 'HSIGMA', 'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility', 'Aroon',
           'DDI', 'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5', 'VOL60', 'RealizedVolatility', 'DASTD', 'DDNSR', 'Hurst']

df = pd.read_csv('dataset_1.csv',encoding='GBK')
df.head()

Unnamed: 0.1,Unnamed: 0,ticker,tradeDate,Beta60,OperatingRevenueGrowRate,NetProfitGrowRate,NetCashFlowGrowRate,NetProfitGrowRate5Y,TVSTD20,TVSTD6,...,RealizedVolatility,DASTD,DDNSR,Hurst,next_month_end,abs_return,active_return,industryName1,label,year
0,0,1,20070131,-0.838573,-0.527473,2.064908,-0.235434,-0.120729,0.478006,0.268336,...,0.0,-0.603847,-1.160788,0.0,20070228,-0.004221,-0.071021,银行,-1,2007
1,1,2,20070131,0.603209,0.476466,-0.019215,-0.013746,-1.210592,0.762963,0.696981,...,0.0,0.655525,-0.847547,0.0,20070228,-0.037359,-0.104159,房地产,-1,2007
2,2,60,20070131,1.439494,-0.559091,1.508711,0.085016,1.42378,2.340842,1.987375,...,0.0,0.74725,0.638908,0.0,20070228,0.171481,0.104681,有色金属,0,2007
3,3,63,20070131,-0.221436,-0.891169,-0.690564,1.331827,0.033096,1.378065,0.707162,...,0.0,-0.694765,-0.580936,0.0,20070228,0.093903,0.027103,通信,0,2007
4,4,69,20070131,-0.882542,0.101637,-0.671414,0.191923,0.463894,2.255643,1.680278,...,0.0,0.64097,0.450284,0.0,20070228,0.020656,-0.046144,房地产,-1,2007


In [8]:
import warnings
warnings.filterwarnings('ignore')


def get_train_val_test_data(df, year):
    back_year = max(2007, year-6)
    train_df = df[(df['year']>=back_year) & (df['year']<year)]
    
    test_df = df[df['year']==year]
    
    return train_df, test_df


def format_feature_label(origin_df, is_filter=False):
    if is_filter:
        origin_df = origin_df[origin_df['label']!=0]
        #模型的label输入范围替换成[0, 1]，比较直观，需要对原始label进行替换
        origin_df['label'] = origin_df['label'].replace(-1, 0)
        
    feature = np.array(origin_df[factors])
    label = np.array(origin_df['label'])
    return feature, label

def write_factor_to_csv(df, predict_score, year, filename):  
    df.loc[:,'factor'] = predict_score
    df = df.loc[:, ['ticker', 'tradeDate', 'active_return', 'factor']]
    is_header = True
    if year != 2011:
        is_header = False
    
    df.to_csv(filename, mode='a+', encoding='utf-8', header=is_header)


def get_rf_result(train_data, train_label, test_data): #model 4
    xgb = XGBClassifier(max_depth = 11,
                    learning_rate = 0.01,
                    n_estimators = 246 ,
                    min_child_weight = 17, 
                    gamma = 0.66 ,
                    subsample =0.60,
                    colsample_bytree = 0.69,
                    reg_alpha = 0.45,
                    reg_lambda = 0.75,
                    n_jobs = 10,
                    verbose = 0)
    xgb.fit(train_data, train_label, eval_metric='auc', verbose = 0)
    predict_score = xgb.predict_proba(test_data)[:, 1]   
    return predict_score

def pipeline():
    '''
    对7个阶段分别进行训练测试，并保存测试的因子合成值
    返回:
        boost_model_list, list结构，每个阶段汇总的模型集合
    '''
    t0 = time.time()
    linear_file = "xgb0524.csv"
    
    for year in range(2011,2018):#range(2011, 2018)
        print('training model for %s' % year)
        t1 = time.time()
        #构建训练测试数据x
        train_df, test_df = get_train_val_test_data(df, year)
        train_feature, train_label = format_feature_label(train_df)
        test_feature, test_label = format_feature_label(test_df)

        predict_score = get_rf_result(train_feature, train_label, test_feature)
        write_factor_to_csv(test_df, predict_score, year, linear_file)

        print('------------------ finish year: %s, time cost: %s seconds--------------' % (year, time.time() - t1))

    print('Done, Time cost: %s seconds' % (time.time() - t0))
pipeline()  

training model for 2011
------------------ finish year: 2011, time cost: 5.677395820617676 seconds--------------
training model for 2012
------------------ finish year: 2012, time cost: 7.582930326461792 seconds--------------
training model for 2013
------------------ finish year: 2013, time cost: 9.981031656265259 seconds--------------
training model for 2014
------------------ finish year: 2014, time cost: 11.080323696136475 seconds--------------
training model for 2015
------------------ finish year: 2015, time cost: 11.71137285232544 seconds--------------
training model for 2016
------------------ finish year: 2016, time cost: 12.481510639190674 seconds--------------
training model for 2017
------------------ finish year: 2017, time cost: 13.002912759780884 seconds--------------
Done, Time cost: 71.51827263832092 seconds
