In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import time
import matplotlib.pyplot as plt
from sklearn import tree
from xgboost import XGBClassifier

In [2]:
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [3]:
data = pd.read_csv('datasets/total_data.csv', encoding='gbk')

In [4]:
X = data[['净三小0', '净利润_mean', '纳税总额_mean', '纳税/净利_mean', '净利/资产_mean', '净利/营收_mean',
          '纳税/营收_mean', '净利/利润_mean', '净利/负债_mean', '纳税/负债_mean', '纳税/融资额度_mean', 
          '纳税/净利_min', '所得税_min', '所得税/纳税_min', '净利润_max', '纳税总额_max', '纳税/净利_max', 
          '净利/资产_max', '净利/营收_max', '纳税/营收_max', '净利/利润_max', '净利/负债_max', '纳税/负债_max', 
          '净利/融资额度_max', '纳税/融资额度_max', '融资/负债_max', '纳税总额_std', '纳税/净利_std', 
          '净利/营收_std', '纳税/营收_std', '所得税/纳税_std', '纳税/负债_std', '纳税/融资额度_std', 
          '净利/所有者_mean',  '政策/净利_mean', '融资/负债_mean' ,'政策/负债_mean', '接受补助_mean', 
          '从业人数_min', '资产总额_min', '负债总额_min', '营业总收入_min', '主营业务收入_min', '利润总额_min',
          '净利润_min', '纳税总额_min', '所有者权益合计_min', '负债/资产_min', '主收/营收_min', '净利/资产_min',
          '净利/营收_min', '纳税/营收_min', '利润/营收_min', '净利/利润_min', '净利/负债_min', '纳税/负债_min', 
          '负债/所有者_min', '费用_min', '费用/营收_min', '利润/费用_min', '净利/融资额度_min', 
          '纳税/融资额度_min', '营收/资产_min', '所有者/资产_min', '净利/所有者_min', '政策/净利_min', 
          '融资/负债_min', '政策/负债_min', '接受补助_min', '资产总额_max', '负债总额_max', '营业总收入_max',
          '主营业务收入_max','利润总额_max', '所有者权益合计_max', '负债/资产_max', '主收/营收_max', 
          '利润/营收_max', '所得税_max', '所得税/纳税_max', '负债/所有者_max', '费用_max', '费用/营收_max', 
          '利润/费用_max', '营收/资产_max', '所有者/资产_max', '净利/所有者_max', '政策/净利_max', 
          '政策/负债_max', '接受补助_max', '从业人数_std', '资产总额_std', '负债总额_std', '营业总收入_std', 
          '主营业务收入_std', '利润总额_std', '净利润_std', '所有者权益合计_std', '负债/资产_std', '主收/营收_std', 
          '净利/资产_std', '利润/营收_std', '净利/利润_std', '所得税_std', '净利/负债_std', '负债/所有者_std', '费用_std',
          '费用/营收_std', '利润/费用_std', '净利/融资额度_std', '营收/资产_std', '所有者/资产_std',
          '净利/所有者_std', '政策/净利_std', '融资/负债_std', '政策/负债_std', '接受补助_std', '从业人数_growth',
          '资产总额_growth', '负债总额_growth', '营业总收入_growth', '主营业务收入_growth', '利润总额_growth',
          '净利润_growth', '纳税总额_growth', '所有者权益合计_growth', '负债/资产_growth', '负债/所有者_growth',
          '所有者/资产_growth', '净利/所有者_growth', '政策/净利_growth'
         ]]

In [5]:
y = data['flag']

In [6]:
clf_rf = RandomForestClassifier(class_weight='balanced', n_estimators=100)
clf_dt = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=4)
clf_xgb = XGBClassifier(max_depth=3, gamma=1, reg_alpha=1, reg_lambda=1)
eclf = VotingClassifier(estimators=[('rf', clf_rf), ('dt', clf_dt), ('xgb', clf_xgb)], voting='soft', weights=[0.762, 0.362, 0.707])

In [7]:
data_iter = pd.read_csv('datasets/total_data_semi.csv', encoding='gbk')

In [8]:
X_iter = data_iter[['净三小0', '净利润_mean', '纳税总额_mean', '纳税/净利_mean', '净利/资产_mean', '净利/营收_mean',
          '纳税/营收_mean', '净利/利润_mean', '净利/负债_mean', '纳税/负债_mean', '纳税/融资额度_mean', 
          '纳税/净利_min', '所得税_min', '所得税/纳税_min', '净利润_max', '纳税总额_max', '纳税/净利_max', 
          '净利/资产_max', '净利/营收_max', '纳税/营收_max', '净利/利润_max', '净利/负债_max', '纳税/负债_max', 
          '净利/融资额度_max', '纳税/融资额度_max', '融资/负债_max', '纳税总额_std', '纳税/净利_std', 
          '净利/营收_std', '纳税/营收_std', '所得税/纳税_std', '纳税/负债_std', '纳税/融资额度_std', 
          '净利/所有者_mean',  '政策/净利_mean', '融资/负债_mean' ,'政策/负债_mean', '接受补助_mean', 
          '从业人数_min', '资产总额_min', '负债总额_min', '营业总收入_min', '主营业务收入_min', '利润总额_min',
          '净利润_min', '纳税总额_min', '所有者权益合计_min', '负债/资产_min', '主收/营收_min', '净利/资产_min',
          '净利/营收_min', '纳税/营收_min', '利润/营收_min', '净利/利润_min', '净利/负债_min', '纳税/负债_min', 
          '负债/所有者_min', '费用_min', '费用/营收_min', '利润/费用_min', '净利/融资额度_min', 
          '纳税/融资额度_min', '营收/资产_min', '所有者/资产_min', '净利/所有者_min', '政策/净利_min', 
          '融资/负债_min', '政策/负债_min', '接受补助_min', '资产总额_max', '负债总额_max', '营业总收入_max',
          '主营业务收入_max','利润总额_max', '所有者权益合计_max', '负债/资产_max', '主收/营收_max', 
          '利润/营收_max', '所得税_max', '所得税/纳税_max', '负债/所有者_max', '费用_max', '费用/营收_max', 
          '利润/费用_max', '营收/资产_max', '所有者/资产_max', '净利/所有者_max', '政策/净利_max', 
          '政策/负债_max', '接受补助_max', '从业人数_std', '资产总额_std', '负债总额_std', '营业总收入_std', 
          '主营业务收入_std', '利润总额_std', '净利润_std', '所有者权益合计_std', '负债/资产_std', '主收/营收_std', 
          '净利/资产_std', '利润/营收_std', '净利/利润_std', '所得税_std', '净利/负债_std', '负债/所有者_std', '费用_std',
          '费用/营收_std', '利润/费用_std', '净利/融资额度_std', '营收/资产_std', '所有者/资产_std',
          '净利/所有者_std', '政策/净利_std', '融资/负债_std', '政策/负债_std', '接受补助_std', '从业人数_growth',
          '资产总额_growth', '负债总额_growth', '营业总收入_growth', '主营业务收入_growth', '利润总额_growth',
          '净利润_growth', '纳税总额_growth', '所有者权益合计_growth', '负债/资产_growth', '负债/所有者_growth',
          '所有者/资产_growth', '净利/所有者_growth', '政策/净利_growth'
         ]]

In [9]:
X_iter

Unnamed: 0,净三小0,净利润_mean,纳税总额_mean,纳税/净利_mean,净利/资产_mean,净利/营收_mean,纳税/营收_mean,净利/利润_mean,净利/负债_mean,纳税/负债_mean,...,主营业务收入_growth,利润总额_growth,净利润_growth,纳税总额_growth,所有者权益合计_growth,负债/资产_growth,负债/所有者_growth,所有者/资产_growth,净利/所有者_growth,政策/净利_growth
0,1.0,-23540.000000,0.000000,0.000000,-0.159998,-0.099999,0.000000,-0.333328,-0.097751,0.000000,...,0.700537,0.204312,0.851108,0.000000,0.679244,0.017681,0.172510,1.201676,0.019301,0.000000
1,1.0,-57505.200000,0.000000,0.000000,-0.193333,-0.100000,0.000000,-0.199998,-0.151839,0.000000,...,2.610889,4.273143,4.273677,0.000000,-0.180930,0.434673,-1.340357,-0.782965,0.337216,0.000000
2,1.0,-43001.800000,0.000000,0.000000,-0.213332,-0.100000,0.000000,-0.261109,-0.367671,0.000000,...,1.323982,0.459928,0.920170,0.000000,-12.548150,-0.075816,-1.475628,0.689580,-1.095509,-0.055556
3,1.0,-2349.333333,0.000000,0.000000,-0.393266,-0.099995,0.000000,-0.333280,-0.229654,0.000000,...,0.172691,-0.308823,0.130972,0.000000,-0.392902,-0.085042,0.538105,4.043982,0.242183,0.000000
4,1.0,-21470.400000,0.000000,0.000000,-0.199998,-0.099999,0.000000,-0.511097,-0.284497,0.000000,...,-0.499402,0.594428,-0.283344,0.000000,2.249736,-0.126050,-0.524234,0.223195,-1.066971,0.045456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10278,0.0,47715.200000,105123.200000,57408.666650,0.246665,0.133333,0.266666,0.555541,0.275685,0.932820,...,1.955748,3.265400,1.440950,2.165734,-0.484846,0.150256,6.374470,-0.141581,7.984570,2340.000000
10279,0.0,26379.600000,103443.600000,56909.633310,0.293329,0.199996,0.399997,0.777725,0.163142,1.339406,...,-0.551038,-0.637023,33344.593339,-0.586037,-7.345213,0.229558,-0.422101,0.309246,0.096342,0.000000
10280,0.0,1725.666667,2285.733333,861.942691,0.466540,0.133306,0.399846,0.283175,0.345721,0.698829,...,7.798054,5.894051,15.099459,1.183823,-4.186638,-0.065763,-1.486383,0.508862,-2.060575,0.000000
10281,0.0,7783.266667,21410.200000,2.332940,0.309990,0.066666,0.266662,0.122220,0.113529,0.257431,...,0.755188,0.046975,-2.334541,-0.523793,-0.186492,-0.221985,-1.441416,6.799826,-2.357295,0.000000


In [10]:
preshape = 0
while X_iter.shape[0] and X_iter.shape[0] != preshape:
    preshape = X_iter.shape[0]
    print(preshape)
    eclf.fit(X, y)
    preds = eclf.predict(X_iter)
    probs = eclf.predict_proba(X_iter)
    predsandprobs = pd.DataFrame({'flag': preds, 'probs': np.max(probs, 1)})
    X_iter = pd.concat([X_iter, predsandprobs], axis=1)
    maxprobs = X_iter['probs'].max()
    if maxprobs < 0.95:
        break
    X_iter1 = X_iter[X_iter['probs'] == maxprobs]
    X_iter = X_iter[X_iter['probs'] != maxprobs][['净三小0', '净利润_mean', '纳税总额_mean', '纳税/净利_mean', '净利/资产_mean', '净利/营收_mean',
          '纳税/营收_mean', '净利/利润_mean', '净利/负债_mean', '纳税/负债_mean', '纳税/融资额度_mean', 
          '纳税/净利_min', '所得税_min', '所得税/纳税_min', '净利润_max', '纳税总额_max', '纳税/净利_max', 
          '净利/资产_max', '净利/营收_max', '纳税/营收_max', '净利/利润_max', '净利/负债_max', '纳税/负债_max', 
          '净利/融资额度_max', '纳税/融资额度_max', '融资/负债_max', '纳税总额_std', '纳税/净利_std', 
          '净利/营收_std', '纳税/营收_std', '所得税/纳税_std', '纳税/负债_std', '纳税/融资额度_std', 
          '净利/所有者_mean',  '政策/净利_mean', '融资/负债_mean' ,'政策/负债_mean', '接受补助_mean', 
          '从业人数_min', '资产总额_min', '负债总额_min', '营业总收入_min', '主营业务收入_min', '利润总额_min',
          '净利润_min', '纳税总额_min', '所有者权益合计_min', '负债/资产_min', '主收/营收_min', '净利/资产_min',
          '净利/营收_min', '纳税/营收_min', '利润/营收_min', '净利/利润_min', '净利/负债_min', '纳税/负债_min', 
          '负债/所有者_min', '费用_min', '费用/营收_min', '利润/费用_min', '净利/融资额度_min', 
          '纳税/融资额度_min', '营收/资产_min', '所有者/资产_min', '净利/所有者_min', '政策/净利_min', 
          '融资/负债_min', '政策/负债_min', '接受补助_min', '资产总额_max', '负债总额_max', '营业总收入_max',
          '主营业务收入_max','利润总额_max', '所有者权益合计_max', '负债/资产_max', '主收/营收_max', 
          '利润/营收_max', '所得税_max', '所得税/纳税_max', '负债/所有者_max', '费用_max', '费用/营收_max', 
          '利润/费用_max', '营收/资产_max', '所有者/资产_max', '净利/所有者_max', '政策/净利_max', 
          '政策/负债_max', '接受补助_max', '从业人数_std', '资产总额_std', '负债总额_std', '营业总收入_std', 
          '主营业务收入_std', '利润总额_std', '净利润_std', '所有者权益合计_std', '负债/资产_std', '主收/营收_std', 
          '净利/资产_std', '利润/营收_std', '净利/利润_std', '所得税_std', '净利/负债_std', '负债/所有者_std', '费用_std',
          '费用/营收_std', '利润/费用_std', '净利/融资额度_std', '营收/资产_std', '所有者/资产_std',
          '净利/所有者_std', '政策/净利_std', '融资/负债_std', '政策/负债_std', '接受补助_std', '从业人数_growth',
          '资产总额_growth', '负债总额_growth', '营业总收入_growth', '主营业务收入_growth', '利润总额_growth',
          '净利润_growth', '纳税总额_growth', '所有者权益合计_growth', '负债/资产_growth', '负债/所有者_growth',
          '所有者/资产_growth', '净利/所有者_growth', '政策/净利_growth'
         ]]
    X_iter = X_iter.reset_index()
    X_iter.drop('index', axis=1, inplace=True)
    X = pd.concat([X, X_iter1[['净三小0', '净利润_mean', '纳税总额_mean', '纳税/净利_mean', '净利/资产_mean', '净利/营收_mean',
          '纳税/营收_mean', '净利/利润_mean', '净利/负债_mean', '纳税/负债_mean', '纳税/融资额度_mean', 
          '纳税/净利_min', '所得税_min', '所得税/纳税_min', '净利润_max', '纳税总额_max', '纳税/净利_max', 
          '净利/资产_max', '净利/营收_max', '纳税/营收_max', '净利/利润_max', '净利/负债_max', '纳税/负债_max', 
          '净利/融资额度_max', '纳税/融资额度_max', '融资/负债_max', '纳税总额_std', '纳税/净利_std', 
          '净利/营收_std', '纳税/营收_std', '所得税/纳税_std', '纳税/负债_std', '纳税/融资额度_std', 
          '净利/所有者_mean',  '政策/净利_mean', '融资/负债_mean' ,'政策/负债_mean', '接受补助_mean', 
          '从业人数_min', '资产总额_min', '负债总额_min', '营业总收入_min', '主营业务收入_min', '利润总额_min',
          '净利润_min', '纳税总额_min', '所有者权益合计_min', '负债/资产_min', '主收/营收_min', '净利/资产_min',
          '净利/营收_min', '纳税/营收_min', '利润/营收_min', '净利/利润_min', '净利/负债_min', '纳税/负债_min', 
          '负债/所有者_min', '费用_min', '费用/营收_min', '利润/费用_min', '净利/融资额度_min', 
          '纳税/融资额度_min', '营收/资产_min', '所有者/资产_min', '净利/所有者_min', '政策/净利_min', 
          '融资/负债_min', '政策/负债_min', '接受补助_min', '资产总额_max', '负债总额_max', '营业总收入_max',
          '主营业务收入_max','利润总额_max', '所有者权益合计_max', '负债/资产_max', '主收/营收_max', 
          '利润/营收_max', '所得税_max', '所得税/纳税_max', '负债/所有者_max', '费用_max', '费用/营收_max', 
          '利润/费用_max', '营收/资产_max', '所有者/资产_max', '净利/所有者_max', '政策/净利_max', 
          '政策/负债_max', '接受补助_max', '从业人数_std', '资产总额_std', '负债总额_std', '营业总收入_std', 
          '主营业务收入_std', '利润总额_std', '净利润_std', '所有者权益合计_std', '负债/资产_std', '主收/营收_std', 
          '净利/资产_std', '利润/营收_std', '净利/利润_std', '所得税_std', '净利/负债_std', '负债/所有者_std', '费用_std',
          '费用/营收_std', '利润/费用_std', '净利/融资额度_std', '营收/资产_std', '所有者/资产_std',
          '净利/所有者_std', '政策/净利_std', '融资/负债_std', '政策/负债_std', '接受补助_std', '从业人数_growth',
          '资产总额_growth', '负债总额_growth', '营业总收入_growth', '主营业务收入_growth', '利润总额_growth',
          '净利润_growth', '纳税总额_growth', '所有者权益合计_growth', '负债/资产_growth', '负债/所有者_growth',
          '所有者/资产_growth', '净利/所有者_growth', '政策/净利_growth'
         ]]])
    X = X.reset_index()
    X.drop('index', axis=1, inplace=True)
    y = pd.concat([y, X_iter1['flag']])
    y = y.reset_index()
    y.drop('index', axis=1, inplace=True)
    y = y['flag']

10283
3110
2776
2768
2767
2766
2737
2726
2712
2708
750
749
747
721
720
719
718
603
600
599
598
597
596
593
592
519
512
511
510
508
505
483
472
471
468
467
466
333
329
327
325
318
317
316
315
314
313
312
307
305
304
303
301
293
292
286
285
262
259
258
252
251
250
245
243
240
239
205
203
202
200
194
190
186
185
183
182
170
169
168
167
166
165
163
162
161
159
158
147
141
140
139
129
128
127
121
120
119
118
116
115
104
100
99
97
96
95
94
88
86
85
84
82
60
59
48
45
43
32
31
28
27
26
24
23
22
21
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3


In [11]:
X

Unnamed: 0,净三小0,净利润_mean,纳税总额_mean,纳税/净利_mean,净利/资产_mean,净利/营收_mean,纳税/营收_mean,净利/利润_mean,净利/负债_mean,纳税/负债_mean,...,主营业务收入_growth,利润总额_growth,净利润_growth,纳税总额_growth,所有者权益合计_growth,负债/资产_growth,负债/所有者_growth,所有者/资产_growth,净利/所有者_growth,政策/净利_growth
0,1.0,-19707.333333,0.000000,0.000000,-0.373321,-0.099999,0.000000,-0.733267,-0.280844,0.000000,...,2.302048,11.065098,1.651893,0.000000,-20.073873,0.106484,-14.506308,1.079509,-6.052426,0.000000
1,1.0,-10046.400000,0.000000,0.000000,-0.266658,-0.099998,0.000000,-0.244437,-0.488728,0.000000,...,0.142823,-0.432522,-0.226643,0.000000,-11.371699,0.314690,3.503315,-0.410945,-13.904677,0.021279
2,1.0,-105177.366667,0.000000,0.000000,-0.296666,-0.100000,0.000000,-0.606998,-0.278097,0.000000,...,0.626039,1.137118,0.819586,0.000000,-14.017837,0.032828,-9.561076,0.225698,-5.328479,0.000000
3,1.0,-38678.933333,0.000000,0.000000,-0.416661,-0.100000,0.000000,-0.344439,-0.498608,0.000000,...,-0.126436,1.275312,0.013048,0.000000,-3.583157,-0.048588,-1.605899,1.115250,-1.008677,0.044445
4,1.0,-113272.800000,0.000000,0.000000,-0.279999,-0.100000,0.000000,-0.583326,-0.194148,0.000000,...,1.504705,4.339257,2.195290,0.000000,2.362229,0.003884,-0.004057,-0.007708,0.232183,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45923,0.0,-3233.400000,132.600000,0.332497,-0.053333,0.029750,0.066633,0.088523,-0.035907,0.014443,...,17.734366,26.164590,-8.769735,-0.498746,-2.418339,0.242084,-8.539482,0.138543,-0.989366,0.133389
45924,0.0,-5352.600000,25561.463903,-2.420362,-0.196659,0.066643,0.325320,0.432732,-0.408551,2.226689,...,4.655447,9.027615,-2.799139,33.677251,0.023200,-0.001231,-0.003463,0.001255,0.439964,-0.179631
45925,0.0,3754.866667,16308.000000,0.999939,0.169993,-0.050883,0.047301,0.223718,0.191728,0.635269,...,8.269777,33.407913,8.792561,24462.000000,-0.256198,-0.174246,-4.233086,1.862639,4.606606,-0.748230
45926,0.0,-36642.133333,26705.130570,-3.219565,-0.156666,0.033329,0.410787,0.083296,-0.095821,0.442435,...,28.230386,31.493482,-9.528498,-0.969987,-1.128495,0.497383,-0.756273,-0.490762,0.286529,-0.022727


In [13]:
clf_rf = RandomForestClassifier(class_weight='balanced', n_estimators=100)
clf_dt = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=4)
clf_xgb = XGBClassifier(max_depth=3, gamma=1, reg_alpha=1, reg_lambda=1)
eclf = VotingClassifier(estimators=[('rf', clf_rf), ('dt', clf_dt), ('xgb', clf_xgb)], voting='soft', weights=[0.762, 0.362, 0.707])
eclf.fit(X, y)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight='balanced',
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
  

In [14]:
import joblib

In [15]:
joblib.dump(eclf, 'eclf.joblib')

['eclf.joblib']

In [16]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [17]:
clf_rf = RandomForestClassifier(class_weight='balanced', n_estimators=100)
clf_dt = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=4)
clf_xgb = XGBClassifier(max_depth=3, gamma=1, reg_alpha=1, reg_lambda=1)
eclf = VotingClassifier(estimators=[('rf', clf_rf), ('dt', clf_dt), ('xgb', clf_xgb)], voting='soft', weights=[0.762, 0.362, 0.707])
accuracy_eclf = []
precision_eclf = []
recall_eclf = []
f1_eclf = []
traintime_eclf = 0
testtime_eclf = 0
for train, test in kf.split(X, y):
    st = time.time()
    eclf.fit(X.iloc[train], y.iloc[train])
    ed = time.time()
    traintime_eclf += (ed-st)
    y_pred = eclf.predict(X.iloc[test])
    edd = time.time()
    testtime_eclf += (edd-ed)
    accuracy_eclf.append(accuracy_score(y_pred, y.iloc[test]))
    precision_eclf.append(precision_score(y_pred, y.iloc[test]))
    recall_eclf.append(recall_score(y_pred, y.iloc[test]))
    f1_eclf.append(f1_score(y_pred, y.iloc[test]))
print("Voting")
print("Accuracy:", np.array(accuracy_eclf).mean())
print("Precision:", np.array(precision_eclf).mean())
print("Recall:", np.array(recall_eclf).mean())
print("F1-Score:", np.array(f1_eclf).mean())
print("Train time:", traintime_eclf / 4.0)
print("Test time:", testtime_eclf)

Voting
Accuracy: 0.9999346808431708
Precision: 0.9998755444928438
Recall: 0.9999377916018662
F1-Score: 0.9999066535270071
Train time: 34.85430550575256
Test time: 1.0033175945281982
