In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
import time
import warnings
warnings.filterwarnings('ignore')


In [11]:
# Names of the 70 factor columns that train_predict selects as model features.
# Order matters: it fixes the column order of the feature matrices.
factors = [
    'Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate', 'NetProfitGrowRate5Y',
    'TVSTD20', 'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV',
    'MLEV', 'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC',
    'GREC', 'DASREV', 'SFY12P', 'LCAP', 'ASSI',
    'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB',
    'PS', 'SalesCostRatio', 'PCF', 'CETOP', 'TotalProfitGrowRate',
    'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI',
    'PSY', 'BIAS10', 'ROE', 'ROA', 'ROA5',
    'ROE5', 'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut',
    'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6',
    'REVS5', 'REVS10', 'REVS20', 'REVS60', 'HSIGMA',
    'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility', 'Aroon', 'DDI',
    'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5',
    'VOL60', 'RealizedVolatility', 'DASTD', 'DDNSR', 'Hurst',
]

# Load the full data set; later cells read its 'tradeDate' and 'label' columns
# and the columns listed in `factors`.
df_ori = pd.read_csv(filepath_or_buffer='dataset_factorRank10.csv')

In [12]:
# Rolling-window construction.
# Each batch spans N_TRAIN_MONTHS + 1 consecutive trade dates: train_test_dev
# trains on the earlier dates and holds out the last one as the test date.
N_TRAIN_MONTHS = 73

# np.unique returns the dates sorted ascending (130 months in the original data set).
tdate = np.unique(df_ori['tradeDate'].values)

data_batch = []
# One window per available test date. Deriving the count from the data
# (130 - 73 = 57 for the original file) keeps it in step with the number of
# dates at or after tdate[73], which is what result_summary iterates over.
for i in range(len(tdate) - N_TRAIN_MONTHS):
    mark1 = tdate[i]                    # first date of the window (inclusive)
    mark2 = tdate[N_TRAIN_MONTHS + i]   # last date of the window (inclusive)
    batch = df_ori[(df_ori['tradeDate'] >= mark1) & (df_ori['tradeDate'] <= mark2)]
    data_batch.append(batch)
    
def train_test_dev(i):
    """Split rolling window ``i`` into a training frame and a test frame.

    The most recent trade date in ``data_batch[i]`` is held out as the test
    set; every earlier date is used for training. Training rows whose label
    is 0 are dropped, and label -1 is recoded to 0 so the target takes the
    values 0 and 1.

    Parameters
    ----------
    i : int
        Index into the module-level ``data_batch`` list.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. ``test_df`` keeps its original labels.
    """
    batch = data_batch[i]
    tdate_sub = np.unique(batch['tradeDate'].values)

    # np.unique sorts ascending, so the last entry is the newest date in the
    # window. For the 74-date windows built above this is the same element as
    # index 73, without hard-coding the window length here.
    cut1 = tdate_sub[-1]
    test_df = batch[batch['tradeDate'] == cut1]

    print('cut1:', cut1)

    # For training data, drop rows labelled 0 and recode label -1 as 0.
    # An explicit copy is taken so the relabelling never writes into a slice
    # of `batch` (chained-assignment hazard).
    keep = (batch['tradeDate'] < cut1) & (batch['label'] != 0)
    train_df = batch[keep].copy()
    train_df['label'] = train_df['label'].replace(-1, 0)

    return train_df, test_df
    
def train_predict(i):
    """Fit a LightGBM classifier on window ``i`` and score its test date.

    Parameters
    ----------
    i : int
        Rolling-window index, forwarded to ``train_test_dev``.

    Returns
    -------
    numpy.ndarray
        Second column of ``predict_proba`` for every row of the test frame,
        in the test frame's row order. With the 0/1 training labels produced
        by ``train_test_dev`` this is the probability of class 1.
    """
    train_df, test_df = train_test_dev(i)

    train_x = train_df[factors]
    train_y = train_df['label']
    test_x = test_df[factors]

    # The XGBoost-style `gamma` argument was dropped: LightGBM does not
    # recognise it, and `min_gain_to_split` below is its LightGBM counterpart.
    # NOTE(review): `subsample` only takes effect in LightGBM when
    # `subsample_freq` > 0; it is left unchanged here so results stay the same.
    model = LGBMClassifier(
        max_depth=7,
        learning_rate=0.01,
        n_estimators=425,
        min_child_weight=14,
        subsample=0.55,
        colsample_bytree=0.5,
        reg_alpha=0.80,
        min_gain_to_split=0.9,
        n_jobs=10,
        verbose=0,
    )
    # `verbose` is no longer passed to fit(): LightGBM 4.x removed that
    # keyword, and with no eval_set there is nothing for it to print.
    model.fit(train_x, train_y, eval_metric='auc')

    predict_score = model.predict_proba(test_x)[:, 1]
    return predict_score
    
def result_summary(df):
    """Run the rolling train/predict loop and attach the scores to ``df``.

    Rows of ``df`` dated on or after the 74th unique trade date are grouped
    by ``tradeDate``. For the k-th group (k starting at 0) the model is
    trained and scored through ``train_predict(k)``, and the scores are
    stored in a new ``factor`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tradeDate`` column. Its rows per date must match the
        test rows of the corresponding ``data_batch`` window, because scores
        are attached by position.

    Returns
    -------
    pandas.DataFrame or None
        The scored groups stacked in date order with a fresh integer index.
        A single scored date is returned with its original index, and
        ``None`` is returned when no rows fall in the scored range.
    """
    # The cutoff comes from the module-level df_ori rather than `df`, since
    # the round number must line up with data_batch, which is built from df_ori.
    all_dates = np.unique(df_ori['tradeDate'].values)
    df = df[df['tradeDate'] >= all_dates[73]]

    scored = []
    for i, (tdate, group) in enumerate(df.groupby('tradeDate')):
        print('round:', i)
        result = train_predict(i)
        print('tdate:', tdate)

        # assign() returns a new frame, so nothing is written into the
        # groupby slice.
        scored.append(group.assign(factor=result))

    if not scored:
        return None
    if len(scored) == 1:
        return scored[0]
    # Concatenate once at the end instead of re-copying the result each round.
    return pd.concat(scored, ignore_index=True)

In [13]:
# Run the full rolling backtest; prints one 'round'/'cut1'/'tdate' triple per scored date.
new_df = result_summary(df=df_ori)

round: 0
cut1: 20130228
tdate: 20130228
round: 1
cut1: 20130329
tdate: 20130329
round: 2
cut1: 20130426
tdate: 20130426
round: 3
cut1: 20130531
tdate: 20130531
round: 4
cut1: 20130628
tdate: 20130628
round: 5
cut1: 20130731
tdate: 20130731
round: 6
cut1: 20130830
tdate: 20130830
round: 7
cut1: 20130930
tdate: 20130930
round: 8
cut1: 20131031
tdate: 20131031
round: 9
cut1: 20131129
tdate: 20131129
round: 10
cut1: 20131231
tdate: 20131231
round: 11
cut1: 20140130
tdate: 20140130
round: 12
cut1: 20140228
tdate: 20140228
round: 13
cut1: 20140331
tdate: 20140331
round: 14
cut1: 20140430
tdate: 20140430
round: 15
cut1: 20140530
tdate: 20140530
round: 16
cut1: 20140630
tdate: 20140630
round: 17
cut1: 20140731
tdate: 20140731
round: 18
cut1: 20140829
tdate: 20140829
round: 19
cut1: 20140930
tdate: 20140930
round: 20
cut1: 20141031
tdate: 20141031
round: 21
cut1: 20141128
tdate: 20141128
round: 22
cut1: 20141231
tdate: 20141231
round: 23
cut1: 20150130
tdate: 20150130
round: 24
cut1: 20150227
t

In [14]:
# Keep only the identifier, date, label and predicted score columns, then write them to CSV.
export_columns = ['ticker', 'tradeDate', 'label', 'factor']
newdf = new_df[export_columns]
newdf.to_csv(path_or_buf='lgb_roll_month.csv')