In [13]:
import warnings
from tqdm import tqdm
import numpy as np
import pandas as pd
import xgboost
import lightgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score,precision_score,recall_score,accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import gc
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']=['Simhei']
plt.rcParams['axes.unicode_minus']=False
import json
import jieba
from scipy.stats import entropy
import fasttext

## 0 数据的简单分析

In [14]:
base_info=pd.read_csv('../train/base_info.csv')#企业的基本信息
annual_report_info=pd.read_csv('../train/annual_report_info.csv')#企业的年报基本信息
tax_info=pd.read_csv('../train/tax_info.csv')#企业的纳税信息
change_info=pd.read_csv('../train/change_info.csv')#变更信息
news_info=pd.read_csv('../train/news_info.csv')#舆情信息
other_info=pd.read_csv('../train/other_info.csv')#其它信息
entprise_info=pd.read_csv('../train/entprise_info.csv')#企业标注信息{0: 13884, 1: 981}
entprise_evaluate=pd.read_csv('../entprise_evaluate.csv')#未标注信息

print('base_info shape:',base_info.shape,'id unique:',len(base_info['id'].unique()))
print('annual_report_info shape:',annual_report_info.shape,'id unique:',len(annual_report_info['id'].unique()))
print('tax_info shape:',tax_info.shape,'id unique:',len(tax_info['id'].unique()))
print('change_info shape:',change_info.shape,'id unique:',len(change_info['id'].unique()))
print('news_info shape:',news_info.shape,'id unique:',len(news_info['id'].unique()))
print('other_info shape:',other_info.shape,'id unique:',len(other_info['id'].unique()))
print('entprise_info shape:',entprise_info.shape,'id unique:',len(entprise_info['id'].unique()))
print('entprise_evaluate shape:',entprise_evaluate.shape,'id unique:',len(entprise_evaluate['id'].unique()))

base_info shape: (24865, 33) id unique: 24865
annual_report_info shape: (22550, 23) id unique: 8937
tax_info shape: (29195, 9) id unique: 808
change_info shape: (45940, 5) id unique: 8726
news_info shape: (10518, 3) id unique: 927
other_info shape: (1890, 4) id unique: 1888
entprise_info shape: (14865, 2) id unique: 14865
entprise_evaluate shape: (10000, 2) id unique: 10000


## 1 特征构建 
###  tfidi处理经营范围(opscope)特征

In [15]:
# tfidif 处理经营范围的特征
#cn_stopwords.txt来源于 https://github.com/goto456/stopwords
def stopwordslist():
    stopwords = [line.strip() for line in open('D:/tianma/stopwords-master/cn_stopwords.txt',encoding='UTF-8').readlines()]
    return stopwords
# 创建一个停用词列表
stopwords = stopwordslist()
stopwords+=['、', '；', '，', '）','（']
#
train_df_scope=base_info.merge(entprise_info)[['id','opscope','label']]
test_df_scope=base_info[base_info['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df_scope=test_df_scope.reset_index(drop=True)[['id','opscope']]
str_label_0=''
str_label_1=''
for index,name,opscope,label in train_df_scope.itertuples():
    # 结巴分词
    seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
    outline = " ".join(seg_text)
    out_str=""
    for per in outline.split():
        if per not in stopwords: 
            out_str += per
            out_str+=" "
    if label==0:
        str_label_0+=out_str
    else:
        str_label_1+=out_str
corpus=[str_label_0,str_label_1]
vectorizer=CountVectorizer()#该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer=TfidfTransformer()#该类会统计每个词语的tf-idf权值
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))#第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵
word=vectorizer.get_feature_names()#获取词袋模型中的所有词语总共7175个词语
weight=tfidf.toarray()#将(2, 7175)tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重
# for i in range(len(weight)):#打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for便利某一类文本下的词语权重
#     #
#     for j in range(len(word)):
#         print(word[j],weight[i][j])
#下面将会根据tfidi算出来的权重将经营范围的文本特征转换为数值(利用weight[1,:]也即各个词语在第二类(违法类中所占据的权重之和))
illegal_word_weights={}
for i in range(len(word)):
    illegal_word_weights[word[i]]=weight[1][i]
tfidi_opscope=[]
for index,name,opscope in base_info[['id','opscope']].itertuples():
    # 
    seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
    outline = " ".join(seg_text)
    tfidi_frt=0
    for per in outline.split():
        if per in illegal_word_weights: 
            tfidi_frt+=illegal_word_weights[per]
    tfidi_opscope.append(tfidi_frt)
base_info['tfidif_opscope']=tfidi_opscope
print('对opscope提取tfidif特征完毕..........')

对opscope提取tfidif特征完毕..........


##  change_info、other_info，news_info，annual_report_info,tax表格的简单特征构建

In [16]:
#change_info
change_info_clean=change_info.drop(['bgrq','bgq','bgh'],axis=1)
change_info_clean = change_info_clean.groupby('id',sort=False).agg('mean')
change_info_clean=pd.DataFrame(change_info_clean).reset_index()
#other_info
#空值大于0.5的列都删除掉
buf_group = other_info.groupby('id',sort=False).agg('mean')
other_info_clean=pd.DataFrame(buf_group).reset_index()
other_info_clean=other_info_clean.fillna(-1)
other_info_clean = other_info_clean.groupby('id',sort=False).agg('mean')
other_info_clean=pd.DataFrame(other_info_clean).reset_index()
#news_info
news_info_clean=news_info.drop(['public_date'],axis=1)
#对object类型进行编码
news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].fillna("中立")
#
dic={}
cate=news_info_clean.positive_negtive.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
#
news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].map(dic)
news_info_clean = news_info_clean.groupby('id',sort=False).agg('mean')
news_info_clean=pd.DataFrame(news_info_clean).reset_index()
#处理annual_report_info的数据
#空值大于0.5的列都删除掉
annual_report_info_clean=annual_report_info.dropna(thresh=annual_report_info.shape[0]*0.5,how='all',axis=1)
#对object类型进行编码
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].fillna("无")
dic = {'无':-1,'开业':0, '歇业':1, '停业':2, '清算':3}
#
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].map(dic)
annual_report_info_clean = annual_report_info_clean.groupby('id',sort=False).agg('mean')
annual_report_info_clean=pd.DataFrame(annual_report_info_clean).reset_index()
#处理tax数据
tax_info_clean=tax_info.copy()
tax_info_clean['START_DATE']=pd.to_datetime(tax_info_clean['START_DATE'])
tax_info_clean['END_DATE']=pd.to_datetime(tax_info_clean['END_DATE'])
tax_info_clean['gap_day']=(tax_info_clean['END_DATE']-tax_info_clean['START_DATE']).dt.total_seconds()//3600//24
tax_info_clean=tax_info_clean.drop(['START_DATE','END_DATE'],axis=1)
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].fillna("无")#17 unique
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].fillna("无")#275 TAX_ITEMS
#对object类型进行编码
dic={}
cate=tax_info_clean.TAX_CATEGORIES.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].map(dic)
#
dic={}
cate=tax_info_clean.TAX_ITEMS.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].map(dic)
tax_info_clean['income']=tax_info_clean['TAX_AMOUNT']/tax_info_clean['TAX_RATE']
#
tax_info_clean = tax_info_clean.groupby('id',sort=False).agg('mean')
tax_info_clean=pd.DataFrame(tax_info_clean).reset_index()
#税额分箱
tax_info_clean['TAX_AMOUNT']=tax_info_clean['TAX_AMOUNT'].fillna(tax_info_clean['TAX_AMOUNT'].median())
tax_info_clean['bucket_TAX_AMOUNT']=pd.qcut(tax_info_clean['TAX_AMOUNT'], 10, labels=False,duplicates='drop')
print('finished .............')

finished .............


## base_info数据较为重要，需要构建诸多交叉特征以及特征分箱

In [17]:
# #处理base_info数据
base_info['opto']=pd.to_datetime(base_info['opto']).fillna(pd.to_datetime(base_info['opto']).max())
base_info_clean=base_info.drop(['opscope','opto'],axis=1)
base_info_clean['opfrom']=pd.to_datetime(base_info_clean['opfrom'])
current_time=pd.to_datetime('2020-11-17 00:00:00')
base_info_clean['alive_year']=(current_time-pd.to_datetime(base_info_clean['opfrom'])).dt.total_seconds()//3600//24//30
base_info_clean.drop(['opfrom'],axis=1,inplace=True)
#............................对object类型进行编码...............................
base_info_clean['industryphy']=base_info_clean['industryphy'].fillna("无")
base_info_clean['dom']=base_info_clean['dom'].fillna("无")
base_info_clean['opform']=base_info_clean['opform'].fillna("无")
base_info_clean['oploc']=base_info_clean['oploc'].fillna("无")
#
dic={}
cate=base_info_clean.industryphy.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['industryphy']=base_info_clean['industryphy'].map(dic)
#
dic={}
cate=base_info_clean.dom.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['dom']=base_info_clean['dom'].map(dic)
#
dic={}
cate=base_info_clean.opform.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['opform']=base_info_clean['opform'].map(dic)
#
dic={}
cate=base_info_clean.oploc.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['oploc']=base_info_clean['oploc'].map(dic)
#
base_info_clean=base_info_clean.fillna(-1)
#
print('编码完毕.................')
#........................分箱.................................
def bucket(name,bucket_len):
    gap_list=[base_info_clean[name].quantile(i/bucket_len) for i in range(bucket_len+1)]#以分位数作为分箱标志
    len_data=len(base_info_clean[name])
    new_col=[]
    for i in base_info_clean[name].values:
        for j in range(len(gap_list)):
            if gap_list[j]>=i:
                encode=j
                break
        new_col.append(encode)
    return new_col
#注册资本_实缴资本
base_info_clean['regcap_reccap']=base_info_clean['regcap']-base_info_clean['reccap']
#注册资本分箱
base_info_clean['regcap']=base_info_clean['regcap'].fillna(base_info_clean['regcap'].median())
base_info_clean['bucket_regcap']=pd.qcut(base_info_clean['regcap'], 10, labels=False,duplicates='drop')
#实缴资本分箱
base_info_clean['reccap']=base_info_clean['reccap'].fillna(base_info_clean['reccap'].median())
base_info_clean['bucket_reccap']=pd.qcut(base_info_clean['reccap'], 10, labels=False,duplicates='drop')
#注册资本_实缴资本分箱
base_info_clean['regcap_reccap']=base_info_clean['regcap_reccap'].fillna(base_info_clean['regcap_reccap'].median())
base_info_clean['bucket_regcap_reccap']=pd.qcut(base_info_clean['regcap_reccap'], 10, labels=False,duplicates='drop')
#.............................交叉.........................
#作两个特征的交叉
def cross_two(name_1,name_2):
    new_col=[]
    encode=0
    dic={}
    val_1=base_info_clean[name_1]
    val_2=base_info_clean[name_2]
    for i in tqdm(range(len(val_1))):
        tmp=str(val_1[i])+'_'+str(val_2[i])
        if tmp in dic:
            new_col.append(dic[tmp])
        else:
            dic[tmp]=encode
            new_col.append(encode)
            encode+=1
    return new_col
#作企业类型-小类的交叉特征
base_info_clean['enttypegb']=base_info_clean['enttypegb'].fillna("无")
base_info_clean['enttypeitem']=base_info_clean['enttypeitem'].fillna("无")
new_col=cross_two('enttypegb','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_enttypeitem']=new_col
#
#行业类别-细类的交叉特征
base_info_clean['industryphy']=base_info_clean['industryphy'].fillna("无")
base_info_clean['industryco']=base_info_clean['industryco'].fillna("无")
new_col=cross_two('industryphy','industryco')#作企业类型-小类的交叉特征
base_info_clean['industryphy_industryco']=new_col
#企业类型-行业类别的交叉特征
new_col=cross_two('enttypegb','industryphy')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_industryphy']=new_col
#行业类别-企业类型小类的交叉特征
new_col=cross_two('industryphy','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['industryphy_enttypeitem']=new_col
#行业类别细类--企业类型小类的交叉特征
new_col=cross_two('industryco','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['industryco_enttypeitem']=new_col

#企业类型-小类-行业类别-细类的交叉特征
new_col=cross_two('enttypegb_enttypeitem','industryphy_industryco')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_enttypeitem_industryphy_industryco']=new_col
base_info_clean.shape

  0%|                                                                                        | 0/24865 [00:00<?, ?it/s]

编码完毕.................


100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 164022.43it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 158843.45it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 159817.11it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 161892.84it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 165650.71it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 164023.21it/s]


(24865, 42)

## category特征单独提取出来

In [18]:
cat_features=['industryphy','dom','opform','oploc','bucket_regcap',
              'bucket_reccap','bucket_regcap_reccap',
              'enttypegb','enttypeitem','enttypegb_enttypeitem',
              'enttypegb_industryphy','enttypegb_enttypeitem_industryphy_industryco',
              'industryphy','industryco','industryphy_industryco',
              'industryphy_enttypeitem','industryco_enttypeitem',
              'adbusign','townsign','regtype','TAX_CATEGORIES','bucket_TAX_AMOUNT',
              'legal_judgment_num','brand_num','patent_num','positive_negtive'
             ]


In [19]:
#
all_data=base_info_clean.merge(annual_report_info_clean,how='outer')
all_data=all_data.merge(tax_info_clean,how='outer')
all_data=all_data.merge(change_info_clean,how='outer')
all_data=all_data.merge(news_info_clean,how='outer')
all_data=all_data.merge(other_info_clean,how='outer')
all_data=all_data.fillna(-1)
all_data[cat_features]=all_data[cat_features].astype(int)
all_data.shape#,base_info.shape,annual_report_info.shape,tax_info.shape

(24865, 73)

In [20]:
#
train_df=all_data.merge(entprise_info)
train_data=train_df.drop(['id','label'],axis=1)
kind=train_df['label']
test_df=all_data[all_data['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df=test_df.reset_index(drop=True)
test_data=test_df.drop(['id'],axis=1)
train_data.shape,test_data.shape

((14865, 72), (10000, 72))

In [21]:
# # 伪标签
# use_pseudo=True
# if use_pseudo:
#     train_data=train_df.drop(['id','label'],axis=1)
#     kind=train_df['label']
#     pseudo_name=[]
#     pseudo_label={'id':[],'label':[]}
#     pseudo_df=pd.read_csv('submit_857_xgb_rf_lgb_cab.csv')
#     for index,name,score in pseudo_df.itertuples():
#         if score>0.9 or score<0.05:
#             pseudo_label['id'].append(name)
#             if score>0.9 :
#                 pseudo_label['label'].append(1)
#             else:
#                 pseudo_label['label'].append(0)
#             pseudo_name.append(name)
#     len(pseudo_name)
#     pseudo_data=test_df[test_df.id.isin(pseudo_name)].reset_index(drop=True)
#     pseudo_data=pseudo_data.merge(pd.DataFrame(pseudo_label))
#     #
#     train_df=pd.concat((train_df,pseudo_data)).reset_index(drop=True)
#     train_data=train_df.drop(['id','label'],axis=1)
#     kind=train_df['label']
#     print(train_data.shape,test_data.shape)
    

In [22]:
#特征筛选
# frt_select=[
#  'industryphy',
#  'enttypegb',
#  'regcap',
#  'townsign',
#  'industryco',
#  'bucket_regcap',
#  'empnum',
#  'bucket_reccap',
#  'enttypeitem',
#  'industryphy_industryco',
#  'reccap',
#  'FORINVESTSIGN',
#  'positive_negtive',
#  'regtype',
#  'STOCKTRANSIGN',
#  'bucket_regcap_reccap',
#  'enttypegb_enttypeitem',
#  'regcap_reccap',
#  'legal_judgment_num',
#  'TAX_CATEGORIES',
#  'TAX_AMOUNT',
#  'bgq_bgh',
#  'TAX_ITEMS']
# frt_select=important_frt[:30]
# train_data=train_data[frt_select]
# test_data=test_data[frt_select]
# cat_features=list(set(frt_select).intersection(set(cat_features)))
# cat_features

In [23]:
# def eval_score(y_test,y_pre):
#     _,_,f_class,_=precision_recall_fscore_support(y_true=y_test,y_pred=y_pre,labels=[0,1],average=None)
#     fper_class={'合法':f_class[0],'违法':f_class[1],'f1':f1_score(y_test,y_pre)}
#     return fper_class
#
def eval_score(y_test,y_pre):
    valid_f1 = f1_score(y_test, y_pre)
    valid_p = precision_score(y_test, y_pre)
    valid_r = recall_score(y_test, y_pre)
    F = valid_p*0.7+valid_r*0.2+valid_f1*0.1
    return F

#
def k_fold_serachParmaters(model,train_val_data,train_val_kind):
    mean_f1=0
    mean_f1Train=0
    n_splits=5
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    for train, test in sk.split(train_val_data, train_val_kind):
        x_train = train_val_data.iloc[train]
        y_train = train_val_kind.iloc[train]
        x_test = train_val_data.iloc[test]
        y_test = train_val_kind.iloc[test]

        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        weigted_score =  eval_score(y_test,pred)
        mean_f1+=weigted_score/n_splits
        #print(fper_class)
        
        pred_Train = model.predict(x_train)
        weigted_score_train =  eval_score(y_train,pred_Train)
        mean_f1Train+=weigted_score_train/n_splits
    #print('mean valf1:',mean_f1)
    #print('mean trainf1:',mean_f1Train)
    return mean_f1

In [26]:
def search_param(n_estimators,lr,max_depth,num_leaves):
    params={'num_leaves':num_leaves
                   ,'max_depth':max_depth
                   ,'learning_rate':lr
                   ,'n_estimators':n_estimators
                   ,'n_jobs':8
               }
    mean_f1=k_fold_serachParmaters(lightgbm.LGBMClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [150]:
    print('n_estimators:',n_estimators)
    for lr in [0.06,]:
        for max_depth in [7,8,9,]:
            for num_leaves in [7,9,11,13]:
                mean_f1=search_param(n_estimators,lr,max_depth,num_leaves)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,num_leaves]
                    best=mean_f1
                    print(param,best)#[150, 0.07, 13, 11] 0.8370422016216938

n_estimators: 150
[150, 0.06, 7, 7] 0.8157825742455604
[150, 0.06, 7, 9] 0.8206371265625654
[150, 0.06, 8, 9] 0.823178846036303


In [29]:
#
def search_param(n_estimators,lr,max_depth,min_child_weight):
    params={'max_depth':max_depth
              ,'learning_rate':lr
              ,'n_estimators':n_estimators
              ,'min_child_weight':min_child_weight
              ,'n_jobs':8
              ,'importance_type':'total_cover'
           }
    
    mean_f1=k_fold_serachParmaters(xgboost.XGBClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数
param=[]
# best=0
# for n_estimators in [50]:
#     print('n_estimators:',n_estimators)
#     for lr in [0.05,0.07,0.09,0.1]:
#         for max_depth in [7,8,9,10]:
#             for min_child_weight in [7,9,11,13,15]:
#                 mean_f1=search_param(n_estimators,lr,max_depth,min_child_weight)
#                 if mean_f1>best:
#                     param=[n_estimators,lr,max_depth,min_child_weight]
#                     best=mean_f1
#                     print(param,best)#[50, 0.04, 4, 13] 0.8449774193716448

In [30]:
def search_param(iter_cnt,lr,max_depth):
    
    params={'iterations':iter_cnt
              ,'learning_rate':lr
              ,'depth':max_depth
              ,'silent':True
              ,'thread_count':8
              ,'task_type':'CPU'
        }
    mean_f1=k_fold_serachParmaters(catboost.CatBoostClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数[52, 0.073, 7] 0.8440488273317126
param=[]
best=0
for iter_cnt in [100]:
    print('iter_cnt:',iter_cnt)
    for lr in [0.08,0.1]:
        for max_depth in [8]:
            mean_f1=search_param(iter_cnt,lr,max_depth)
            if mean_f1>best:
                param=[iter_cnt,lr,max_depth]#[55, 0.076, 7]
                best=mean_f1
                print(param,best)
print(param,best)#筛选特征:[70, 0.06 8] 0.8417684642475657 所有特征:[54, 0.07, 7] 0.8411337269934891


iter_cnt: 100
[100, 0.08, 8] 0.8209030188770057
[100, 0.1, 8] 0.8312103438202738
[100, 0.1, 8] 0.8312103438202738


In [31]:
rf_v1_params = {
    'n_jobs': -1,
    'n_estimators': 50,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 7,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0,
    'random_state':2020,
}
rf_v2_params = {
    'oob_score':True,
    'random_state':2020,
    'n_estimators': 70,
    'max_depth':13,
    'min_samples_split':5
}
# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':60,
    'max_features': 0.5,
    'max_depth': 20,
    'min_samples_leaf': 1,
    'verbose': 0,
    'random_state':2020,
}


# Gradient Boosting parameters
gb_params = {
    'n_estimators': 60,
    'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
    'random_state':2020,
}
#
lgb_params = {'num_leaves':9
                ,'max_depth':8
                ,'learning_rate':0.05
                ,'n_estimators':150
                ,'n_jobs':8
             }
xgb_params ={'max_depth':7
              ,'learning_rate':0.05
              ,'n_estimators':55
              ,'reg_alpha':0.005
              ,'n_jobs':8
              ,'importance_type':'total_cover'
            }
cab_params={'iterations':60
          ,'learning_rate':0.05
          ,'depth':10
          ,'silent':True
          ,'thread_count':8
          ,'task_type':'CPU'
          #,'cat_features':cat_features
}
#
rf_v1 = RandomForestClassifier(**rf_v1_params)
print("RandomForestClassifier_v1:",k_fold_serachParmaters(rf_v1,train_data,kind))
rf_v2 = RandomForestClassifier(**rf_v2_params)
print("RandomForestClassifier_v2:",k_fold_serachParmaters(rf_v2,train_data,kind))
et = ExtraTreesClassifier(**et_params)
print("ExtraTreesClassifier:",k_fold_serachParmaters(et,train_data,kind))
gb = GradientBoostingClassifier(**gb_params)
print("GradientBoostingClassifier:",k_fold_serachParmaters(gb,train_data,kind))
lgb=lightgbm.LGBMClassifier(**lgb_params)
print('LGBMClassifier:',k_fold_serachParmaters(lgb,train_data,kind))
xgb=xgboost.XGBClassifier(**xgb_params)
print('XGBClassifier:',k_fold_serachParmaters(xgb,train_data,kind))
cab=catboost.CatBoostClassifier(**cab_params)
print('CatBoostClassifier:',k_fold_serachParmaters(cab,train_data,kind))

RandomForestClassifier_v1: 0.8499826928084713
RandomForestClassifier_v2: 0.8244763507490608
ExtraTreesClassifier: 0.8257472968425423
GradientBoostingClassifier: 0.8252509150745929
LGBMClassifier: 0.8242357999459483
XGBClassifier: 0.8230064578597459
CatBoostClassifier: 0.8313917650438513


In [32]:
#
def cnt_result(xx):
    cnt_re={0:0,1:0}
    for a in xx:
        if a<=0.5:
            cnt_re[0]+=1
        else:
            cnt_re[1]+=1
    return cnt_re
#

# 两层stacking

In [35]:
# 
use_selectFrt=False
ntrain = train_df.shape[0]
ntest = test_df.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
#kf = KFold(n_splits= NFOLDS, random_state=SEED)
kf=StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2020)
print("-------->>>>>定义一个Sklearn classifier 的扩展<<<<<--------")
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
#"-----------------定义oof:stacking的核心流程-------------"
print("-------->>>>>定义oof:stacking的核心流程<<<<<--------")
'''
'''
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))#(14865,)
    oof_test = np.zeros((ntest,))#(10000,)
    oof_test_skf = np.empty((NFOLDS, ntest))#(5, 10000)
    #
    fold=0
    for train_index, test_index in kf.split(x_train, y_train):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]#该折余下的验证集

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)#对该折余下的验证集做预测，将结果填补在这些数据在原始数据中的位置
        oof_test_skf[fold, :] = clf.predict_proba(x_test)#用此时的模型（第i折的模型）对测试做预测，放在第i折对应的位置
        #oof_train[test_index] = clf.predict(x_te)
        #oof_test_skf[fold, :] = clf.predict(x_test)
        fold+=1

    oof_test[:] = oof_test_skf.mean(axis=0)#将这N折模型对测试集的预测结果进行一个平均，作为改模型的预测结果
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
#
'''
第一层的基模型
'''
# Put in our parameters for said classifiers
# Random Forest parameters
# 
print("-------->>>>>第一层的基模型<<<<<--------")
rf_v1 = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_v1_params)
rf_v2 = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_v2_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
lgb= SklearnHelper(clf=lightgbm.LGBMClassifier, seed=SEED, params=lgb_params)
xgb = SklearnHelper(clf=xgboost.XGBClassifier, seed=SEED, params=xgb_params)
cab=SklearnHelper(clf=catboost.CatBoostClassifier, seed=SEED, params=cab_params)

#
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
#
train_df=all_data.merge(entprise_info)
test_df=all_data[all_data['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df=test_df.reset_index(drop=True)
y_train = train_df['label'].ravel()
train_data = train_df.drop(['id','label'], axis=1)
test_data=test_df.drop(['id'],axis=1)
if use_selectFrt:
    select_frt=important_frt[:30]
    train_data=train_data[select_frt]
    test_data=test_data[select_frt]
x_train = train_data.values # Creates an array of the train data
x_test = test_data.values # Creats an array of the test data
#
print("-------->>>>>训练、测试数据<<<<<--------")
print("y_train:{};train_data:{};test_data:{}".format(y_train.shape,train_data.shape,test_data.shape))
#
# Create our OOF train and test predictions. These base results will be used as new features
print("-------->>>>>第一阶段训练oof<<<<<--------")
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
print("Extra Trees.............")
rf_oof_train_v1, rf_oof_test_v1 = get_oof(rf_v1,x_train, y_train, x_test) # Random Forest
print("Random Forest_v1.............")
rf_oof_train_v2, rf_oof_test_v2 = get_oof(rf_v2,x_train, y_train, x_test) # Random Forest
print("Random Forest_v2.............")
lgb_oof_train, lgb_oof_test = get_oof(lgb,x_train, y_train, x_test) # Gradient Boost
print("LGBClassifier.............")
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
print("Gradient.............")
xgb_oof_train, xgb_oof_test = get_oof(xgb,x_train, y_train, x_test) # XGBClassifier
print("XGBClassifier.............")
cab_oof_train, cab_oof_test = get_oof(cab,x_train, y_train, x_test)# CatClassifier
print("CatClassifier.............")

print("Training Stage_1 is complete")

-------->>>>>定义一个Sklearn classifier 的扩展<<<<<--------
-------->>>>>定义oof:stacking的核心流程<<<<<--------
-------->>>>>第一层的基模型<<<<<--------
-------->>>>>训练、测试数据<<<<<--------
y_train:(14865,);train_data:(14865, 72);test_data:(10000, 72)
-------->>>>>第一阶段训练oof<<<<<--------
Extra Trees.............
Random Forest.............
Random Forest_v2.............
LGBClassifier.............
Gradient.............
XGBClassifier.............
CatClassifier.............
Training Stage_1 is complete


In [37]:
# #
x_train = np.concatenate(( et_oof_train, rf_oof_train_v1, rf_oof_train_v2, gb_oof_train, lgb_oof_train, xgb_oof_train,cab_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test_v1,rf_oof_test_v2, gb_oof_test,lgb_oof_test, xgb_oof_test, cab_oof_test), axis=1)
#

In [38]:
def search_param(n_estimators,lr,max_depth,num_leaves):
    params={'num_leaves':num_leaves
                   ,'max_depth':max_depth
                   ,'learning_rate':lr
                   ,'n_estimators':n_estimators
                   ,'n_jobs':8
               }
    mean_f1=k_fold_serachParmaters(lightgbm.LGBMClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [80]:
    print('n_estimators:',n_estimators)
    for lr in [0.04,0.05,0.06]:
        for max_depth in [4,5,6,7,8]:
            for num_leaves in [4,5,6,7,8]:
                mean_f1=search_param(n_estimators,lr,max_depth,num_leaves)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,num_leaves]
                    best=mean_f1
                    print(param,best)#[80, 0.05, 4, 4] 0.8419565786981478

n_estimators: 80
[80, 0.04, 4, 4] 0.8516515249707891
[80, 0.04, 4, 5] 0.8518063396655999
[80, 0.04, 4, 6] 0.854340990298924
[80, 0.04, 4, 8] 0.8555600899834916
[80, 0.04, 5, 8] 0.8561099582451429


In [39]:
def search_param(n_estimators,lr,max_depth,min_child_weight):
    params={'max_depth':max_depth
              ,'learning_rate':lr
              ,'n_estimators':n_estimators
              ,'min_child_weight':min_child_weight
              ,'n_jobs':8
              ,'importance_type':'total_cover'
           }
    
    mean_f1=k_fold_serachParmaters(xgboost.XGBClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [50]:
    print('n_estimators:',n_estimators)
    for lr in [0.03,0.04]:
        for max_depth in [3,4,5,6]:
            for min_child_weight in [5,7,9,11,13,15]:
                mean_f1=search_param(n_estimators,lr,max_depth,min_child_weight)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,min_child_weight]
                    best=mean_f1
                    print(param,best)#[50, 0.04, 4, 13] 0.8449774193716448

n_estimators: 50
[50, 0.03, 3, 5] 0.8521864513401066
[50, 0.03, 3, 11] 0.8523042631007035
[50, 0.03, 3, 13] 0.8537931384951206
[50, 0.03, 6, 13] 0.8558096475945506


In [43]:
def search_param(iter_cnt,lr,max_depth):
    
    params={'iterations':iter_cnt
              ,'learning_rate':lr
              ,'depth':max_depth
              ,'silent':True
              ,'thread_count':8
              ,'task_type':'CPU'
        }
    mean_f1=k_fold_serachParmaters(catboost.CatBoostClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数[52, 0.073, 7] 0.8440488273317126
param=[]
best=0
for iter_cnt in [100]:
    print('iter_cnt:',iter_cnt)
    for lr in [0.03,0.04,0.05]:
        for max_depth in [7,8,6]:
            mean_f1=search_param(iter_cnt,lr,max_depth)
            if mean_f1>best:
                param=[iter_cnt,lr,max_depth]#[55, 0.04, 5] 0.8475966668074264
                best=mean_f1
                print(param,best)
print(param,best)#[55, 0.04, 5] 0.8475966668074264


iter_cnt: 100
[100, 0.03, 7] 0.8480191309321614
[100, 0.03, 8] 0.8481074193605483
[100, 0.03, 6] 0.8492033735787374
[100, 0.04, 7] 0.8506759802228252
[100, 0.04, 8] 0.8509362237864766
[100, 0.05, 7] 0.8543198637162136
[100, 0.05, 6] 0.8546795944699905
[100, 0.05, 6] 0.8546795944699905


In [44]:
# #
def search_param(n_estimators,max_depth,min_samples_split):
    params={'oob_score':True,
            'random_state':2020,
            'n_estimators': n_estimators,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split
            }
    mean_f1=k_fold_serachParmaters(RandomForestClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [40,50]:
    print('n_estimators:',n_estimators)
    for min_samples_split in [4,5,7,8]:
        for max_depth in [4,5,7,9,]:
            mean_f1=search_param(n_estimators,max_depth,min_samples_split)
            if mean_f1>best:
                param=[n_estimators,max_depth,min_samples_split]
                best=mean_f1
                print(param,best)#[50, 4, 8] 0.8442795786278773



n_estimators: 40
[40, 4, 4] 0.8500048665554116
[40, 7, 4] 0.8514957600585706
[40, 9, 4] 0.8530427606351101
n_estimators: 50
[50, 9, 7] 0.8541367972490226


# stage-2

In [45]:
#stage_2的模型调参

lgb_params_stage2 = {'num_leaves':8
                ,'max_depth':5
                ,'learning_rate':0.04
                ,'n_estimators':80
                ,'n_jobs':8
             }
xgb_params_stage2 ={'max_depth':6
              ,'learning_rate':0.03
              ,'n_estimators':50
              ,'min_child_weight':13
              ,'n_jobs':8
              ,'importance_type':'total_cover'
            }
cab_params_stage2={'iterations':100
          ,'learning_rate':0.05
          ,'depth':6
          ,'silent':True
          ,'thread_count':8
          ,'task_type':'CPU'
          #,'cat_features':cat_features
}
rf_params_stage2={'oob_score':True,
            'random_state':2020,
            'n_estimators': 50,
            'max_depth':9,
            'min_samples_split':7
            }
# #
x_train = np.concatenate(( et_oof_train, rf_oof_train_v1, rf_oof_train_v2, gb_oof_train, lgb_oof_train, xgb_oof_train,cab_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test_v1,rf_oof_test_v2, gb_oof_test,lgb_oof_test, xgb_oof_test, cab_oof_test), axis=1)
#
lgb_stage2=lightgbm.LGBMClassifier(**lgb_params_stage2)
print('LGBMClassifier:',k_fold_serachParmaters(lgb_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
xgb_stage2=xgboost.XGBClassifier(**xgb_params_stage2)
print('XGBClassifier:',k_fold_serachParmaters(xgb_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
cab_stage2=catboost.CatBoostClassifier(**cab_params_stage2)
print('CatBoostClassifier:',k_fold_serachParmaters(cab_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
rf_stage2 = RandomForestClassifier(**rf_params_stage2)
print("RandomForestClassifier_v1:",k_fold_serachParmaters(rf_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))

LGBMClassifier: 0.8561099582451429
XGBClassifier: 0.8558096475945506
CatBoostClassifier: 0.8546795944699905
RandomForestClassifier_v1: 0.8541367972490226


# 训练stage2

In [49]:
print("Training is complete")
print("------第二阶段训练开始--------")
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, xgb_oof_train,cab_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test,gb_oof_test, xgb_oof_test, cab_oof_test), axis=1)

#
predictions_stage2=[]
for model_two_stage in [lgb_stage2,xgb_stage2,cab_stage2,rf_stage2]:
    #
    model_two_stage.fit(x_train, y_train)
    predictions = model_two_stage.predict_proba(x_test)[:,1]
    predictions_stage2.append(predictions)
    cnt_re=cnt_result(predictions)
    print("预测结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))
#
predictions=sum(predictions_stage2)/len(predictions_stage2)#
#predictions=np.sqrt(sum(np.array(np.array(predictions_stage2))**2)/len(predictions_stage2))#平方平均
#predictions=pow(np.prod(np.array(predictions_stage2), axis=0),1/len(predictions_stage2))#几何平均
cnt_re=cnt_result(predictions)
print("最终预测结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))

Training is complete
------第二阶段训练开始--------
预测结果中合法的数量9109;违法的数量891,合法/违法10.223345
预测结果中合法的数量9104;违法的数量896,合法/违法10.160714
预测结果中合法的数量9094;违法的数量906,合法/违法10.037528
预测结果中合法的数量9093;违法的数量907,合法/违法10.025358
最终预测结果中合法的数量9104;违法的数量896,合法/违法10.160714


In [50]:
votes=[(pre>0.5).astype(int) for pre in predictions_stage2]
vote_most=[]
for i in range(len(predictions_stage2[0])):
    vote_list=np.array(votes)[:,i]
    tmp = {0: 0, 1: 0}
    for k in vote_list:
        tmp[k] += 1
    #
    most = sorted(tmp.items(), key=lambda item: item[1])[-1][0]
    vote_most.append(most)
cnt_re=cnt_result(vote_most)
print("投票结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))

投票结果中合法的数量9097;违法的数量903,合法/违法10.074197


In [51]:
accuracy_score((predictions>0.5).astype(int),vote_most)

0.9989

In [54]:

test_df['score']=vote_most#predictions#
submit_csv=test_df[['id','score']]
save_path='../submission/submit_stack_1120_'+str(cnt_re[1])+'.csv'
submit_csv.to_csv(save_path,index=False)
save_path

'../submission/submit_stack_1120_903.csv'

In [55]:
submit_csv

Unnamed: 0,id,score
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,0
1,da8691b210adb3f67820f5e0c87b337d63112cee52211888,0
2,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,0
3,f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db,0
4,f000950527a6feb617e8d6ca7025dcf9d765429969122069,0
...,...,...
9995,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,0
9996,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,1
9997,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,0
9998,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,0
