In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm.sklearn import LGBMClassifier
# Names of the factor columns used as model inputs, in the column order
# the feature matrix is built with.
factors = [
    'Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate',
    'NetProfitGrowRate5Y', 'TVSTD20', 'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV', 'MLEV',
    'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC', 'GREC', 'DASREV', 'SFY12P',
    'LCAP', 'ASSI', 'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB', 'PS', 'SalesCostRatio', 'PCF',
    'CETOP', 'TotalProfitGrowRate', 'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI', 'PSY', 'BIAS10',
    'ROE', 'ROA', 'ROA5', 'ROE5', 'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut',
    'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6', 'REVS5', 'REVS10',
    'REVS20', 'REVS60', 'HSIGMA', 'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility',
    'Aroon', 'DDI', 'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5', 'VOL60', 'RealizedVolatility',
    'DASTD', 'DDNSR', 'Hurst',
]

# Load the factor dataset from CSV and preview the first five rows.
df = pd.read_csv('dataset_factorRank10.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ticker,tradeDate,Beta60,OperatingRevenueGrowRate,NetProfitGrowRate,NetCashFlowGrowRate,NetProfitGrowRate5Y,TVSTD20,...,RealizedVolatility,DASTD,DDNSR,Hurst,next_month_end,abs_return,active_return,industryName1,label,year
0,0,0,1,20070131,9,8,1,8,6,3,...,10,8,10,10,20070228,-0.004221,-0.071021,银行,-1,2007
1,1,1,2,20070131,3,3,5,5,9,2,...,10,3,9,10,20070228,-0.037359,-0.104159,房地产,-1,2007
2,2,2,60,20070131,1,8,1,4,1,1,...,10,2,3,10,20070228,0.171481,0.104681,有色金属,0,2007
3,3,3,63,20070131,7,9,9,1,5,1,...,10,8,8,10,20070228,0.093903,0.027103,通信,0,2007
4,4,4,69,20070131,9,5,9,4,3,1,...,10,3,3,10,20070228,0.020656,-0.046144,房地产,-1,2007


In [10]:
import time
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas chained-assignment warnings and any warnings the
# model library emits; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Re-reads the same CSV the earlier cell already loaded; pipeline() below uses
# this module-level `df`.
df = pd.read_csv('dataset_factorRank10.csv')

def get_train_val_test_data(df, year, lookback_years=6, first_year=2007):
    """Split ``df`` into a rolling training window and a single test year.

    Despite the name, no validation split is produced: only a training frame
    and a test frame are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``year`` column.
    year : int
        The test year. Rows whose ``year`` equals this value form the test set.
    lookback_years : int, default 6
        Maximum number of years before ``year`` to include in the training
        window.
    first_year : int, default 2007
        The training window never starts before this year.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``train_df`` holds rows with
        ``max(first_year, year - lookback_years) <= year_col < year``;
        ``test_df`` holds rows with ``year_col == year``. Either frame may be
        empty if no rows match.
    """
    window_start = max(first_year, year - lookback_years)
    years = df['year']

    train_df = df[(years >= window_start) & (years < year)]
    test_df = df[years == year]

    return train_df, test_df

def format_feature_label(origin_df, is_filter=True, feature_columns=None):
    """Extract the feature matrix and label vector from a factor frame.

    Parameters
    ----------
    origin_df : pandas.DataFrame
        Must contain a ``label`` column and every column named in
        ``feature_columns``.
    is_filter : bool, default True
        When True, rows whose label is 0 are dropped and the remaining ``-1``
        labels are replaced by ``0``, so the labels fed to the model lie in
        {0, 1}. When False, all rows and the original labels are returned.
    feature_columns : sequence of str, optional
        Columns to use as features. Defaults to the module-level ``factors``
        list.

    Returns
    -------
    (feature, label) : tuple of numpy.ndarray
        ``feature`` has one row per kept row of ``origin_df`` and one column
        per feature column; ``label`` is the matching 1-D label array.
    """
    if feature_columns is None:
        feature_columns = factors

    labels = origin_df['label']
    if is_filter:
        keep = labels != 0
        origin_df = origin_df[keep]
        # Remap on the Series directly instead of assigning a column on the
        # filtered slice, which avoids pandas' chained-assignment warning.
        labels = labels[keep].replace(-1, 0)

    feature = np.array(origin_df[list(feature_columns)])
    label = np.array(labels)
    return feature, label

def write_factor_to_csv(df, predict_score, year, filename):
    """Append the predicted scores for one test year to a CSV file.

    The rows written are the ``ticker``, ``tradeDate`` and ``label`` columns
    of ``df`` plus a ``factor`` column holding ``predict_score``. The frame's
    index is written as the first, unnamed CSV column.

    Parameters
    ----------
    df : pandas.DataFrame
        Test-set rows; must contain ``ticker``, ``tradeDate`` and ``label``.
        The frame is not modified.
    predict_score : array-like
        One score per row of ``df``, in row order.
    year : int
        Kept for interface compatibility. It no longer decides whether the
        header is written, so the output is valid whichever year comes first.
    filename : str
        Destination CSV path; rows are appended if the file already exists.
    """
    # Progress output: number of rows and number of scores being written.
    print(len(df))
    print(len(predict_score))

    # Build the output on a new frame rather than adding a column to the
    # caller's slice of the full dataset.
    out = df[['ticker', 'tradeDate', 'label']].assign(factor=predict_score)

    # Write the header exactly once: only when starting a new (or empty) file.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    out.to_csv(filename, mode='a', encoding='utf-8', header=needs_header)

def get_rf_result(train_data, train_label, test_data): #model 4
    """Fit a LightGBM classifier and score the test rows.

    Despite the ``rf`` in the name, the model is an ``LGBMClassifier``
    (gradient-boosted trees), not a random forest.

    Parameters
    ----------
    train_data : array-like
        Training feature matrix.
    train_label : array-like
        Training labels, one per row of ``train_data``.
    test_data : array-like
        Feature matrix to score, with the same columns as ``train_data``.

    Returns
    -------
    numpy.ndarray
        Column 1 of ``predict_proba(test_data)``, one value per test row.
    """
    # `gamma` was dropped: it is an XGBoost parameter name that LightGBM does
    # not recognise, so it was only passed through **kwargs and ignored. The
    # minimum split gain is already set through `min_gain_to_split`.
    # NOTE(review): `subsample` appears to have no effect unless
    # `subsample_freq` is also set to a positive value — confirm whether row
    # subsampling was intended.
    model = LGBMClassifier(
        max_depth=7,
        learning_rate=0.01,
        n_estimators=425,
        min_child_weight=14,
        subsample=0.55,
        colsample_bytree=0.5,
        reg_alpha=0.80,
        min_gain_to_split=0.9,
        n_jobs=10,
        verbose=0,
    )
    # `verbose` is no longer passed to fit(): LightGBM 4.x does not accept it
    # there, and logging is already controlled by the constructor's `verbose`.
    # `eval_metric` only matters if an `eval_set` is supplied later.
    model.fit(train_data, train_label, eval_metric='auc')

    return model.predict_proba(test_data)[:, 1]

def pipeline():
    '''
    Train and score one model per test year, for the 7 years 2011 to 2017,
    and save the predicted composite-factor values for each test year to
    "lgb_year.csv".

    Reads the module-level ``df``. Any existing output file is removed first,
    so re-running the cell rebuilds the file instead of appending duplicate
    rows to the previous run's output.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    '''
    t0 = time.time()
    linear_file = "lgb_year.csv"

    # The per-year writer appends, so clear the previous run's output first.
    if os.path.exists(linear_file):
        os.remove(linear_file)

    for year in range(2011, 2018):
        print('training model for %s' % year)
        t1 = time.time()
        # Build the training and test data for this year.
        train_df, test_df = get_train_val_test_data(df, year)
        train_feature, train_label = format_feature_label(train_df)
        # Test rows are left unfiltered so every row receives a score.
        test_feature, _ = format_feature_label(test_df, False)

        predict_score = get_rf_result(train_feature, train_label, test_feature)
        write_factor_to_csv(test_df, predict_score, year, linear_file)

        print('------------------ finish year: %s, time cost: %s seconds--------------' % (year, time.time() - t1))

    print('Done, Time cost: %s seconds' % (time.time() - t0))
# Run the year-by-year train/predict loop; scores are written to lgb_year.csv.
pipeline()  

training model for 2011
8047
8047
------------------ finish year: 2011, time cost: 1.3115899562835693 seconds--------------
training model for 2012
8490
8490
------------------ finish year: 2012, time cost: 1.418198585510254 seconds--------------
training model for 2013
8607
8607
------------------ finish year: 2013, time cost: 1.6804847717285156 seconds--------------
training model for 2014
8728
8728
------------------ finish year: 2014, time cost: 1.6753933429718018 seconds--------------
training model for 2015
8966
8966
------------------ finish year: 2015, time cost: 1.8365395069122314 seconds--------------
training model for 2016
9168
9168
------------------ finish year: 2016, time cost: 1.9559760093688965 seconds--------------
training model for 2017
8771
8771
------------------ finish year: 2017, time cost: 1.9674458503723145 seconds--------------
Done, Time cost: 11.846729040145874 seconds
