In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import time
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Names of the input columns handed to the models; later code selects them
# from the data with `frame[factors]`, so order here is the feature order.
factors = [
    'Beta60', 'OperatingRevenueGrowRate', 'NetProfitGrowRate', 'NetCashFlowGrowRate', 'NetProfitGrowRate5Y',
    'TVSTD20', 'TVSTD6', 'TVMA20', 'TVMA6', 'BLEV',
    'MLEV', 'CashToCurrentLiability', 'CurrentRatio', 'REC', 'DAREC',
    'GREC', 'DASREV', 'SFY12P', 'LCAP', 'ASSI',
    'LFLO', 'TA2EV', 'PEG5Y', 'PE', 'PB',
    'PS', 'SalesCostRatio', 'PCF', 'CETOP', 'TotalProfitGrowRate',
    'CTOP', 'MACD', 'DEA', 'DIFF', 'RSI',
    'PSY', 'BIAS10', 'ROE', 'ROA', 'ROA5',
    'ROE5', 'DEGM', 'GrossIncomeRatio', 'ROECut', 'NIAPCut',
    'CurrentAssetsTRate', 'FixedAssetsTRate', 'FCFF', 'FCFE', 'PLRC6',
    'REVS5', 'REVS10', 'REVS20', 'REVS60', 'HSIGMA',
    'HsigmaCNE5', 'ChaikinOscillator', 'ChaikinVolatility', 'Aroon', 'DDI',
    'MTM', 'MTMMA', 'VOL10', 'VOL20', 'VOL5',
    'VOL60', 'RealizedVolatility', 'DASTD', 'DDNSR', 'Hurst',
]

# Source table for the whole notebook. Later cells read its `tradeDate` and
# `label` columns plus every column named in `factors`.
df_ori = pd.read_csv(filepath_or_buffer='dataset2_factorRank10.csv')

In [2]:
# Number of distinct trade dates in the dataset.
np.unique(df_ori['tradeDate'].values).size

135

In [3]:
# Rolling windows over the trade dates, one window per test date.
# Window i covers WINDOW_SPAN + 1 consecutive dates: tdate[i] .. tdate[i + WINDOW_SPAN],
# both ends inclusive. The functions below index positions 72 and 74 inside each window.
WINDOW_SPAN = 74

data_batch = []
# np.unique returns the distinct dates in sorted order. The count cell above
# reports 135 distinct dates for this dataset.
tdate = np.unique(df_ori['tradeDate'].values)
# Derive the number of windows from the data instead of hard-coding 61, so the
# loop can neither index past the end of `tdate` nor silently skip trailing dates
# when the dataset has a different length. For 135 dates this is still 61 windows.
for i in range(len(tdate) - WINDOW_SPAN):
    mark1 = tdate[i]
    mark2 = tdate[i + WINDOW_SPAN]
    batch = df_ori[(df_ori['tradeDate'] >= mark1) & (df_ori['tradeDate'] <= mark2)]
    data_batch.append(batch)
    
def train_test_dev(i, test_idx=74, dev_start_idx=72):
    """Split rolling window ``i`` into training, validation and test frames.

    The window is taken from the module-level ``data_batch`` list and its
    distinct trade dates are addressed by position in sorted order.

    Parameters
    ----------
    i : int
        Position of the window inside ``data_batch``.
    test_idx : int, default 74
        Position (within the window's sorted distinct dates) of the test date.
    dev_start_idx : int, default 72
        Position of the first validation date. Validation covers every date
        from this one up to, but not including, the test date.

    Returns
    -------
    tuple of (train_df, dev_df, test_df)
        ``train_df``: rows dated before the first validation date.
        ``dev_df``: rows dated from the first validation date up to the test date.
        ``test_df``: every row on the test date, labels untouched.
        In ``train_df`` and ``dev_df`` rows with label 0 are dropped and label
        -1 is recoded to 0.

    Raises
    ------
    IndexError
        If ``i`` is outside ``data_batch`` or the window has too few distinct
        dates for ``test_idx`` / ``dev_start_idx``.
    """
    batch = data_batch[i]
    dates = batch['tradeDate']
    tdate_sub = np.unique(dates.values)

    cut1 = tdate_sub[test_idx]
    cut2 = tdate_sub[dev_start_idx]
    print('cut1:',cut1)

    test_df = batch[dates == cut1]

    # Rows usable for fitting: strictly before the test date, label 0 removed.
    history = batch[(dates < cut1) & (batch['label'] != 0)]
    # Recode -1 -> 0 by building a new frame. Assigning through .loc on a
    # filtered frame is the SettingWithCopy pattern and would only be silent
    # here because warnings are globally suppressed.
    history = history.assign(label=history['label'].where(history['label'] != -1, 0))

    # `history` is already limited to dates before cut1, so one comparison
    # against cut2 is enough to separate train from validation.
    train_df = history[history['tradeDate'] < cut2]
    dev_df = history[history['tradeDate'] >= cut2]

    return train_df, dev_df, test_df
    
def _second_class_scores(models, x):
    """Return one column per model holding column 1 of ``predict_proba(x)``.

    Columns are named '1', '2', ... in the order the models are given.
    Column 1 of ``predict_proba`` is the probability of the second entry of the
    model's ``classes_`` (label 1 when the training labels are {0, 1}).
    """
    return pd.DataFrame(
        {str(pos): model.predict_proba(x)[:, 1] for pos, model in enumerate(models, start=1)}
    )


def train_predict(i, random_state=0):
    """Fit a two-level stacked model on window ``i`` and score its test rows.

    Level 1: a random forest and an L2 logistic regression, both fitted on the
    training split. Level 2: a logistic regression fitted on the validation
    split, using the two level-1 probabilities as its only features.

    Parameters
    ----------
    i : int
        Window index, forwarded to ``train_test_dev``.
    random_state : int, default 0
        Seed for the random forest. Without it every run of the notebook
        produced different scores.

    Returns
    -------
    numpy.ndarray
        Level-2 probability (column 1 of ``predict_proba``) for each row of the
        test split, in the test split's row order.
    """
    train_df, dev_df, test_df = train_test_dev(i)

    train_x, train_y = train_df[factors], train_df['label']

    rf = RandomForestClassifier(n_estimators=50,
                                min_samples_split=50,
                                min_samples_leaf=13,
                                n_jobs=8,
                                max_depth=14,
                                random_state=random_state)
    rf.fit(train_x, train_y)

    lr1 = LogisticRegression(C=0.01, max_iter=50, fit_intercept=True, penalty='l2', verbose=0)
    lr1.fit(train_x, train_y)

    base_models = (rf, lr1)

    # The meta-model is fitted on validation rows only, which the base models
    # never saw during their own fit.
    lr2 = LogisticRegression()
    lr2.fit(_second_class_scores(base_models, dev_df[factors]), dev_df['label'])

    # Same column names and order as at fit time, so lr2 sees matching features.
    predict_score = lr2.predict_proba(_second_class_scores(base_models, test_df[factors]))[:, 1]
    return predict_score
    
def result_summary(df):
    """Score each test date in ``df`` and return the rows with a ``factor`` column.

    Rows of ``df`` dated on or after the 75th distinct trade date of the
    module-level ``df_ori`` are grouped by ``tradeDate``. The k-th group (in
    ascending date order) is scored with ``train_predict(k)``, so the groups
    are expected to line up with the windows in ``data_batch``; the printed
    ``cut1`` / ``tdate`` pairs let the reader confirm this.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tradeDate`` column. Not modified.

    Returns
    -------
    pandas.DataFrame or None
        The scored rows of all groups stacked in date order, or ``None`` when
        no row of ``df`` falls in the test range.

    Raises
    ------
    ValueError
        If ``train_predict`` returns a different number of scores than the
        group has rows.
    """
    all_dates = np.unique(df_ori['tradeDate'].values)
    first_test_date = all_dates[74]
    test_rows = df[df['tradeDate'] >= first_test_date]

    scored = []
    for i, (trade_date, group) in enumerate(test_rows.groupby('tradeDate')):
        print('round:',i)
        result = train_predict(i)
        print('tdate:',trade_date)
        # assign() builds a new frame, so the groupby slice is never written to.
        scored.append(group.assign(factor=result))

    if not scored:
        return None
    if len(scored) == 1:
        # A lone group is returned as-is (index not reset), matching what
        # callers received before the concat was moved out of the loop.
        return scored[0]
    # One concat at the end instead of one per iteration avoids re-copying the
    # accumulated rows on every round.
    return pd.concat(scored, ignore_index=True)

In [4]:
# Score every trade date from the 75th distinct date onward; each round prints
# its index, the window's test date (cut1) and the group's date (tdate).
new_df = result_summary(df=df_ori)

round: 0
cut1: 20130329
tdate: 20130329
round: 1
cut1: 20130426
tdate: 20130426
round: 2
cut1: 20130531
tdate: 20130531
round: 3
cut1: 20130628
tdate: 20130628
round: 4
cut1: 20130731
tdate: 20130731
round: 5
cut1: 20130830
tdate: 20130830
round: 6
cut1: 20130930
tdate: 20130930
round: 7
cut1: 20131031
tdate: 20131031
round: 8
cut1: 20131129
tdate: 20131129
round: 9
cut1: 20131231
tdate: 20131231
round: 10
cut1: 20140130
tdate: 20140130
round: 11
cut1: 20140228
tdate: 20140228
round: 12
cut1: 20140331
tdate: 20140331
round: 13
cut1: 20140430
tdate: 20140430
round: 14
cut1: 20140530
tdate: 20140530
round: 15
cut1: 20140630
tdate: 20140630
round: 16
cut1: 20140731
tdate: 20140731
round: 17
cut1: 20140829
tdate: 20140829
round: 18
cut1: 20140930
tdate: 20140930
round: 19
cut1: 20141031
tdate: 20141031
round: 20
cut1: 20141128
tdate: 20141128
round: 21
cut1: 20141231
tdate: 20141231
round: 22
cut1: 20150130
tdate: 20150130
round: 23
cut1: 20150227
tdate: 20150227
round: 24
cut1: 20150331
t

In [5]:
# Write the scored rows to disk (the frame's index is written as the first column).
# NOTE(review): the file name mentions "xgb", but the models fitted in this
# notebook are a random forest and logistic regressions - confirm the intended name.
new_df.to_csv(path_or_buf='stacking_xgb72_lr72.csv')