In [256]:
import warnings
from tqdm import tqdm
import numpy as np
import pandas as pd
import xgboost
import lightgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score,precision_score,recall_score,accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import gc
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']=['Simhei']
plt.rcParams['axes.unicode_minus']=False
import json
import jieba
from scipy.stats import entropy
import fasttext


## WOE编码
要对一个变量进行WOE编码，需要首先把这个变量进行分组处理（也叫离散化、分箱等等）

WOE_i=ln(py_i)/ln(pn_i)=ln(sum(y_pos)/y_T)/ln(sum(y_neg)/y_T)

WOE表示的实际上是“当前分组中响应客户占所有响应客户的比例”和“当前分组中没有响应的客户占所有没有响应的客户的比例”的差异。

WOE也可以这么理解，他表示的是当前这个组中响应的客户和未响应客户的比值，和所有样本中这个比值的差异。

这个差异是用这两个比值的比值，再取对数来表示的。

WOE越大，这种差异越大，这个分组里的样本响应的可能性就越大，WOE越小，差异越小，这个分组里的样本响应的可能性就越小

目标编码

## Target encoding
是表示分类列的一种非常有效的方法，并且仅占用一个特征空间，也称为均值编码。

该列中的每个值都被该类别的平均目标值替代。这可以更直接地表示分类变量和目标变量之间的关系，

并且也是一种很受欢迎的技术方法（尤其是在 Kaggle 比赛中）。

## 留一法（Leave-one-out）编码
试图通过计算平均值（不包括当前行值）来弥补对 y 变量的依赖以及值的多样性。这使异常值的影响趋于平稳，并创建更多样化的编码值。


由于模型不仅要面对每个编码类的相同值，还要面对一个范围值，因此它可以更好地泛化

In [257]:
#该函数用于返回一个dataframe中哪些是object类型
def get_object_columns(df):
    return df.select_dtypes(include=["object"]).columns.tolist()
#该函数用于计算一个dataframe中每一列的空值比例,thresh为阈值，返回一个列名的列表
def get_nan_ratio(df,thresh):
    nan_dic=(df.isnull().sum()/len(base_info)).to_dict()
    nan_list=[]
    for key in nan_dic.keys():
        if nan_dic[key]>thresh:#缺失值高于thresh的列被返回
            nan_list.append(key)
    return nan_list,sorted( nan_dic.items(),key = lambda x:x[1],reverse = True)
#........................分箱.................................
def bucket(df,name,bucket_len):
    gap_list=[df[name].quantile(i/bucket_len) for i in range(bucket_len+1)]#以分位数作为分箱标志
    len_data=len(df[name])
    new_col=[]
    for i in df[name].values:
        for j in range(len(gap_list)):
            if gap_list[j]>=i:
                encode=j
                break
        new_col.append(encode)
    return new_col
#作两个特征的交叉
def cross_two(df,name_1,name_2):
    new_col=[]
    encode=0
    dic={}
    val_1=df[name_1]
    val_2=df[name_2]
    for i in tqdm(range(len(val_1))):
        tmp=str(val_1[i])+'_'+str(val_2[i])
        if tmp in dic:
            new_col.append(dic[tmp])
        else:
            dic[tmp]=encode
            new_col.append(encode)
            encode+=1
    return new_col
# 特征nunique分布
def nunique_info(df,cat_features):
    for cat_fea in cat_features:
        print(cat_fea + "的特征分布如下：")
        print("{}特征有个{}不同的值".format(cat_fea, df[cat_fea].nunique()))
        print(df[cat_fea].value_counts())

In [258]:
#
class Category_encoders_hepler(object):
    def __init__(self):
        pass
    #
    def encode(self,data_train,data_test,cols,mode):
        if mode=="catboost":
            encoder = CatBoostEncoder(cols=cols, 
                            handle_unknown='value',  
                            handle_missing='value',
                            ).fit(data_train[cols],data_train['label'])
        elif mode=="woe":
            encoder = WOEEncoder(cols=cols, 
                                handle_unknown='value',  
                                handle_missing='value',
                                ).fit(data_train[cols],data_train['label'])
        elif mode=="target":
            encoder = TargetEncoder(cols=cols, 
                            handle_unknown='value',  
                            handle_missing='value',
                            min_samples_leaf=10).fit(data_train[cols],data_train['label'])
            
        elif mode=="leave_one_out":
            encoder = LeaveOneOutEncoder(cols=cols, 
                            handle_unknown='value',  
                            handle_missing='value',
                           ).fit(data_train[cols],data_train['label'])
        elif mode=="ordinal":
            encoder = OrdinalEncoder(cols = cols, 
                         handle_unknown = 'value', 
                         handle_missing = 'value').fit(data_train[cols],None)
        
        suffix='_'+mode.split('_')[0]
        encoded_train = encoder.transform(data_train[cols]) # 转换训练集
        encoded_train.columns=[i+suffix for i in encoded_train.columns]
        data_train=pd.concat([data_train,encoded_train],axis=1)

        encoded_test = encoder.transform(data_test[cols]) # 转换测试集
        encoded_test.columns=[i+suffix for i in encoded_test.columns]
        data_test=pd.concat([data_test,encoded_test],axis=1)
        
        data_train=data_train.drop(cols,axis=1)
        data_test=data_test.drop(cols,axis=1)
        return data_train,data_test
#------一个验证编码正确性的小例子---------------
# encoder_tools=Category_encoders_hepler()
# example_train_df = pd.DataFrame(np.array([['male',10],['female', 20], ['male',10], 
#                        ['female',20],['female',15]]),
#              columns = ['Sex','Type'])
# example_train_df['label'] = np.array([False, True, True, False, False])

# # 随机生成一些测试集, 并有意让其包含未在训练集出现过的类别与缺失值
# example_test_df = pd.DataFrame(np.array([['female',20],['male', 20], ['others',15], 
#                        ['male',20],['female',40], ['male', 25]]),
#              columns = ['Sex','Type'])
# example_test_df.loc[4,'Type'] = np.nan
# train_new,test_new=encode_tool.encode(example_train_df,example_test_df,['Sex','Type'],mode="ordinal")



## 0 数据的简单分析

In [259]:
base_info=pd.read_csv('../train/base_info.csv')#企业的基本信息
annual_report_info=pd.read_csv('../train/annual_report_info.csv')#企业的年报基本信息
tax_info=pd.read_csv('../train/tax_info.csv')#企业的纳税信息
change_info=pd.read_csv('../train/change_info.csv')#变更信息
news_info=pd.read_csv('../train/news_info.csv')#舆情信息
other_info=pd.read_csv('../train/other_info.csv')#其它信息
entprise_info=pd.read_csv('../train/entprise_info.csv')#企业标注信息{0: 13884, 1: 981}
entprise_evaluate=pd.read_csv('../entprise_evaluate.csv')#未标注信息

print('base_info shape:',base_info.shape,'id unique:',len(base_info['id'].unique()))
print('annual_report_info shape:',annual_report_info.shape,'id unique:',len(annual_report_info['id'].unique()))
print('tax_info shape:',tax_info.shape,'id unique:',len(tax_info['id'].unique()))
print('change_info shape:',change_info.shape,'id unique:',len(change_info['id'].unique()))
print('news_info shape:',news_info.shape,'id unique:',len(news_info['id'].unique()))
print('other_info shape:',other_info.shape,'id unique:',len(other_info['id'].unique()))
print('entprise_info shape:',entprise_info.shape,'id unique:',len(entprise_info['id'].unique()))
print('entprise_evaluate shape:',entprise_evaluate.shape,'id unique:',len(entprise_evaluate['id'].unique()))

base_info shape: (24865, 33) id unique: 24865
annual_report_info shape: (22550, 23) id unique: 8937
tax_info shape: (29195, 9) id unique: 808
change_info shape: (45940, 5) id unique: 8726
news_info shape: (10518, 3) id unique: 927
other_info shape: (1890, 4) id unique: 1888
entprise_info shape: (14865, 2) id unique: 14865
entprise_evaluate shape: (10000, 2) id unique: 10000


## 1 特征构建 
###  tfidf处理经营范围(opscope)特征

In [260]:
# tfidif 处理经营范围的特征
#cn_stopwords.txt来源于 https://github.com/goto456/stopwords
# 创建一个停用词列表
def tfidif_frt():
    stopwords = stopwords = [line.strip() for line in open('D:/tianma/stopwords-master/cn_stopwords.txt',encoding='UTF-8').readlines()]
    stopwords+=['、', '；', '，', '）','（']
    #
    train_df_scope=base_info.merge(entprise_info)[['id','opscope','label']]
    test_df_scope=base_info[base_info['id'].isin(entprise_evaluate['id'].unique().tolist())]
    test_df_scope=test_df_scope.reset_index(drop=True)[['id','opscope']]
    str_label_0=''
    str_label_1=''
    for index,name,opscope,label in train_df_scope.itertuples():
        # 结巴分词
        seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
        outline = " ".join(seg_text)
        out_str=""
        for per in outline.split():
            if per not in stopwords: 
                out_str += per
                out_str+=" "
        if label==0:
            str_label_0+=out_str
        else:
            str_label_1+=out_str
    corpus=[str_label_0,str_label_1]
    vectorizer=CountVectorizer()#该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
    transformer=TfidfTransformer()#该类会统计每个词语的tf-idf权值
    tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))#第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵
    word=vectorizer.get_feature_names()#获取词袋模型中的所有词语总共7175个词语
    weight=tfidf.toarray()#将(2, 7175)tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重
    # for i in range(len(weight)):#打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for便利某一类文本下的词语权重
    #     #
    #     for j in range(len(word)):
    #         print(word[j],weight[i][j])
    #下面将会根据tfidi算出来的权重将经营范围的文本特征转换为数值(利用weight[1,:]也即各个词语在第二类(违法类中所占据的权重之和))
    illegal_word_weights={}
    for i in range(len(word)):
        illegal_word_weights[word[i]]=weight[1][i]
    tfidi_opscope=[]
    for index,name,opscope in base_info[['id','opscope']].itertuples():
        # 
        seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
        outline = " ".join(seg_text)
        tfidi_frt=0
        for per in outline.split():
            if per in illegal_word_weights: 
                tfidi_frt+=illegal_word_weights[per]
        tfidi_opscope.append(tfidi_frt)
    return tfidi_opscope
base_info['tfidif_opscope']=tfidif_frt()
base_info.drop(['opscope'],axis=1,inplace=True)
print('对opscope提取tfidif特征完毕..........')

对opscope提取tfidif特征完毕..........


##  change_info、other_info，news_info，annual_report_info,tax表格的简单特征构建

- id:企业唯一标识(object)
- oplocdistrict:行政区划代码
- industryphy:行业类别代码(object)
- industryco:行业细类代码
- dom:经营地址(object)
- opscope:经营范围(object)
- enttype:企业类型
- enttypeitem:企业类型小类
- opfrom:经营期限起(object)
- opto:经营期限止(object)
- state:状态
- orgid:机构标识
- jobid:职位标识
- adbusign:是否广告经营
- townsign:是否城镇
- regtype:主题登记类型
- empnum:从业人数
- compform:组织形式
- parnum:合伙人数
- exenum:执行人数
- opform:经营方式(object)
- ptbusscope:兼营范围
- venind:风险行业
- enttypeminu:企业类型细类
- midpreindcode:中西部优势产业代码
- protype:项目类型
- oploc:经营场所(object)
- regcap:注册资本（金）
- reccap:实缴资本
- forreccap:实缴资本（外方）
- forregcap:注册资本（外方）
- congro:投资总额
- enttypegb:企业（机构）类型]

In [261]:
#可以看到，经营地址编码虽然不同，但是长度变化却很小.总共长度只有21种，并且都可以被16整除，前32位不同的只有412种
#对于id特征亦如此，长度都为48，所以可以先对id按16位进行切分
#pd.Series([len(i) for i in base_info['dom'].values]).unique()/16
base_info['id_prefix']=[per[:16] for per in base_info['id'].values]
base_info['dom_prefix']=[per[:32] for per in base_info['dom'].values]
base_info['id_dom']=base_info['id'].astype(str) + '_' + base_info['dom'].astype(str)
#-------------------------------------
#构建交叉特征
base_info['enttypegb']=base_info['enttypegb'].fillna("无")
base_info['enttype']=base_info['enttype'].astype(str)
base_info['enttype']=base_info['enttype'].fillna("无")
base_info['enttypeitem']=base_info['enttypeitem'].fillna("无")
base_info['industryphy']=base_info['industryphy'].fillna("无")
base_info['industryco']=base_info['industryco'].fillna("无")
#行业类别_行业细类
base_info['industryphy_industryco']=base_info['industryphy'].astype(str) + '_' + base_info['industryco'].astype(str)
#企业类型_企业类型小类
base_info['enttype_enttypeitem']=base_info['enttype'].astype(str) + '_' + base_info['enttypeitem'].astype(str)
#行业类别_企业类型
base_info['industryphy_enttype']=base_info['industryphy'].astype(str) + '_' + base_info['enttype'].astype(str)
#
base_info['enttype_enttypeitem_industryphy_industryco']=base_info['enttype_enttypeitem'].astype(str) + '_' + base_info['industryphy_industryco'].astype(str)
#
base_info['industryphy_id_prefix']=base_info['industryphy'].astype(str) + '_' + base_info['id_prefix'].astype(str)
base_info['enttype_id_prefix']=base_info['enttype'].astype(str) + '_' + base_info['id_prefix'].astype(str)
base_info['industryphy_industryco_id_prefix']=base_info['industryphy_industryco'].astype(str) + '_' + base_info['id_prefix'].astype(str)
base_info['enttype_enttypeitem_id_prefix']=base_info['enttype_enttypeitem'].astype(str) + '_' + base_info['id_prefix'].astype(str)
#
base_info['industryphy_dom_prefix']=base_info['industryphy'].astype(str) + '_' + base_info['dom_prefix'].astype(str)
base_info['enttype_dom_prefix']=base_info['enttype'].astype(str) + '_' + base_info['dom_prefix'].astype(str)
#
base_info['enttypegb_industryphy']=base_info['enttypegb'].astype(str) + '_' + base_info['industryphy'].astype(str)
base_info['enttypegb_enttype']=base_info['enttypegb'].astype(str) + '_' + base_info['enttype'].astype(str)
base_info['enttypegb_industryphy_enttype']=base_info['enttypegb'].astype(str) + '_' + base_info['industryphy_enttype'].astype(str)
#---------------------------
# #处理base_info数据
base_info['reccap']=base_info['reccap'].fillna(-2)
drop_nan_columns=get_nan_ratio(base_info,thresh=0.8)[0]#缺失值高于thresh=0.8直接删掉
base_info_clean=base_info.drop(drop_nan_columns,axis=1)
#'enttypeminu', 0.7076211542328574),企业类型细类
#  ('venind', 0.6606877136537301),风险行业
#  ('opto', 0.6450834506334204),
#  ('opform', 0.638045445405188),
#  ('compform', 0.5724512366780615),
base_info_clean['null_enttypeminu']=(base_info_clean['enttypeminu'].isnull()).astype(int)
base_info_clean['null_enttypeminu']=(base_info_clean['enttypeminu'].isnull()).astype(int)
base_info_clean['venind']=base_info_clean['venind'].fillna(-2)
base_info_clean['compform']=base_info_clean['compform'].fillna(-2)
#------------------
#
base_info_clean['opfrom']=pd.to_datetime(base_info_clean['opfrom'])
current_time=pd.to_datetime('2020-11-17 00:00:00')
base_info_clean['alive_year']=(current_time-pd.to_datetime(base_info_clean['opfrom'])).dt.total_seconds()//3600//24//30
base_info_clean.drop(['opfrom','opto','dom'],axis=1,inplace=True)
#
for i in get_object_columns(base_info_clean):
    print("{}的unique长度:{}".format(i,len(base_info_clean[i].unique())))

#--------------------------分箱---------------------
#注册资本减去实缴资本
base_info_clean['regcap_diff_reccap']=base_info_clean['regcap']-base_info_clean['reccap']
base_info_clean['regcap_div_reccap']=base_info_clean['reccap']/base_info_clean['regcap']
#注册资本分箱
base_info_clean['regcap']=base_info_clean['regcap'].fillna(base_info_clean['regcap'].median())
base_info_clean['bucket_regcap']=pd.qcut(base_info_clean['regcap'], 10, labels=False,duplicates='drop')
#实缴资本分箱
base_info_clean['reccap']=base_info_clean['reccap'].fillna(base_info_clean['reccap'].median())
base_info_clean['bucket_reccap']=pd.qcut(base_info_clean['reccap'], 10, labels=False,duplicates='drop')
#注册资本_实缴资本分箱
base_info_clean['regcap_diff_reccap']=base_info_clean['regcap_diff_reccap'].fillna(base_info_clean['regcap_diff_reccap'].median())
base_info_clean['bucket_regcap_diff_reccap']=pd.qcut(base_info_clean['regcap_diff_reccap'], 10, labels=False,duplicates='drop')
#
base_info_clean.shape

id的unique长度:24865
industryphy的unique长度:20
industryco的unique长度:346
enttype的unique长度:17
enttypeitem的unique长度:32
opform的unique长度:34
oploc的unique长度:5351
id_prefix的unique长度:65
dom_prefix的unique长度:412
id_dom的unique长度:24865
industryphy_industryco的unique长度:348
enttype_enttypeitem的unique长度:37
industryphy_enttype的unique长度:99
enttype_enttypeitem_industryphy_industryco的unique长度:1072
industryphy_id_prefix的unique长度:245
enttype_id_prefix的unique长度:116
industryphy_industryco_id_prefix的unique长度:1680
enttype_enttypeitem_id_prefix的unique长度:194
industryphy_dom_prefix的unique长度:773
enttype_dom_prefix的unique长度:597
enttypegb_industryphy的unique长度:250
enttypegb_enttype的unique长度:53
enttypegb_industryphy_enttype的unique长度:250


(24865, 45)

In [None]:
#以下各个类别对企业id做计数特征
#industryphy:行业类别代码
#industryco:行业细类代码
#dom:经营地址(object)
#opscope:经营范围(object)
#enttype:企业类型
#enttypeitem:企业类型小类
'''
count特征，选一个主键，如果选industryphy:行业类别代码做主键，与id做unique特征，则表示
某一个具体的行业类别对应了多少个企业。
'''
def gen_cnt_feature(df, feature):
    cnt_features = []
    for fea in feature:
        df[fea + '_count'] = df.groupby([fea])['id'].transform('count')
        cnt_features.append(fea + '_count')
    return df
'''
unique特征，选一个主键，如果选industryphy:行业类别代码做主键，与id做unique特征，则表示
某一个具体的行业类别对应了多少独有的id,如果与industryco:行业细类代码做unique特征，则表示某一个具体的行业类别对应
了多少个独有的行业细类别.
'''
def get_nunique(df,group_col,sta_col):
    for i in group_col:
        for j in sta_col:
            if i!=j:
                df[i+'_nunique_'+j] = df.groupby([i])[j].transform('nunique')
                #df[i+'_mode_'+j] = df.groupby([i])[j].transform(lambda x :x.mode()[0])
                #df[i+'_entropy_'+j] = df.groupby([i])[j].transform(lambda x: entropy(x.value_counts() / x.shape[0]))
    return df
#选取以下列作count特征
cnt_cols=['enttypegb','enttype','enttypeitem','industryphy','industryco','industryphy_industryco',
        'enttype_enttypeitem','industryphy_enttype','enttype_enttypeitem_industryphy_industryco',
         'enttypegb_enttype','enttypegb_industryphy']
base_info_clean=gen_cnt_feature(base_info_clean.copy(),cnt_cols)
group_col=['enttypegb','industryphy']
sta_col=['id','enttype','industryco','enttypeitem']
base_info_clean=get_nunique(base_info_clean.copy(),group_col,sta_col)
base_info_clean.shape

In [262]:
base_info_clean

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,...,enttypegb_industryphy,enttypegb_enttype,enttypegb_industryphy_enttype,null_enttypeminu,alive_year,regcap_diff_reccap,regcap_div_reccap,bucket_regcap,bucket_reccap,bucket_regcap_diff_reccap
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513,1100,1150,6,340223010010000000,340200000000115392,0,...,1151_M,1151_1100,1151_M_1100,0,16.0,52.0,-0.04,2,0,3
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090,9600,无,6,340222060010000000,340200000000112114,0,...,9600_O,9600_9600,9600_O_9600,1,38.0,12.0,-0.20,0,0,1
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053,1100,1150,6,340202010010000000,400000000000753910,0,...,1151_R,1151_1100,1151_R_1100,0,2.0,102.0,-0.02,4,0,5
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212,4500,4540,6,340221010010000000,400000000000013538,0,...,4540_L,4540_4500,4540_L_4500,1,62.0,12.0,-0.20,0,0,1
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810,1100,1130,7,340200000000000000,400000000000283237,0,...,1130_R,1130_1100,1130_R_1100,1,36.0,102.0,-0.02,4,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24860,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,340225,O,8131,9600,无,6,340200000000116780,341400000000011622,0,...,9600_O,9600_9600,9600_O_9600,1,142.0,22.0,-0.10,1,0,2
24861,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,340207,J,6790,4500,4530,6,340200000000000000,340200000000115797,0,...,4533_J,4533_4500,4533_J_4500,0,59.0,110.0,0.00,5,0,5
24862,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,340207,O,8111,9600,无,6,340207030010000000,340200000000115275,0,...,9600_O,9600_9600,9600_O_9600,1,103.0,12.0,-0.20,0,0,1
24863,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,340225,O,8090,1100,1130,7,340200000000116750,341400000000015220,0,...,1130_O,1130_1100,1130_O_1100,1,98.0,0.0,1.00,5,2,0


- id:企业唯一标识
- ANCHEYEAR:年度
- STATE:状态
- FUNDAM:资金数额
- MEMNUM:成员人数
- FARNUM:农民人数
- ANNNEWMEMNUM:本年度新增成员人数
- ANNREDMEMNUM:本年度退出成员人数
- EMPNUM:从业人数
- EMPNUMSIGN:从业人数是否公示
- BUSSTNAME:经营状态名称
- COLGRANUM:其中高校毕业生人数经营者
- RETSOLNUM:其中退役士兵人数经营者
- DISPERNUM:其中残疾人人数经营者
- UNENUM:其中下岗失业人数经营者
- COLEMPLNUM:其中高校毕业生人数雇员
- RETEMPLNUM:其中退役士兵人数雇员
- DISEMPLNUM:其中残疾人人数雇员
- UNEEMPLNUM:其中下岗失业人数雇员
- WEBSITSIGN:是否有网站标志
- FORINVESTSIGN:是否有对外投资企业标志
- STOCKTRANSIGN:有限责任公司本年度是否发生股东股权转让标志
- PUBSTATE:公示状态：1 全部公示，2部分公示,3全部不公示]

In [263]:
#change_info
# [id:企业唯一标识, bgxmdm:变更信息代码, bgq:变更前, bgh:变更后, bgrq:变更日期]
change_info_clean=change_info.copy()
change_info_clean['bgrq']=pd.to_datetime(change_info_clean['bgrq'])
current_time=pd.to_datetime('2020-11-17 00:00:00')
#变更日期距今长度
change_info_clean['bgrq_gap']=(current_time-pd.to_datetime(change_info_clean['bgrq'])).dt.total_seconds()//3600//24//30
#将变更前、后的代码作一个交叉
change_info_clean['bgq']=change_info_clean['bgq'].fillna("无")
change_info_clean['bgh']=change_info_clean['bgh'].fillna("无")
new_col=cross_two(change_info_clean,'bgq','bgh')#作企业类型-小类的交叉特征
change_info_clean['bgq_bgh']=new_col
#
change_info_clean=change_info_clean.drop(['bgrq','bgq','bgh'],axis=1)
change_info_clean=change_info_clean.groupby('id',sort=False).agg({"bgxmdm":["count","sum","mean","std"],
                                             "bgrq_gap":["sum","mean","max","min"],
                                             "bgq_bgh":["median","mean"],
                                            }).reset_index()
change_info_clean.columns=["id","change_count","bgxmdm_sum","bgxmdm_mean","bgxmdm_std",
                           "bgrq_gap_sum","bgrq_gap_mean","bgrq_gap_max","bgrq_gap_min",
                           "bgq_bgh_median","bgq_bgh_mean"]
#other_info
#由于other信息缺失较多，所以对某一个id而言，将是否具有ohther信息作为一个特征
other_info_clean = other_info.groupby('id',sort=False).agg('mean')
other_info_clean=pd.DataFrame(other_info_clean).reset_index()
other_info_clean=other_info_clean.fillna(-2)
other_info_clean['has_other']=[1]*len(other_info_clean)
#news_info
#news_info
#找到一些异常数据单独处理
news_info_clean=news_info.copy()
public_date_values=news_info_clean['public_date'].values
for i in range(len(public_date_values)):
    if len(public_date_values[i])!=10:
        public_date_values[i]='2020-11-16'
news_info_clean['public_date']=public_date_values
news_info_clean['public_date']=pd.to_datetime(news_info_clean['public_date'])
current_time=pd.to_datetime('2020-11-17 00:00:00')
news_info_clean['news_gap']=(current_time-pd.to_datetime(news_info_clean['public_date'])).dt.total_seconds()//3600//24//30
news_info_clean=news_info_clean.drop(['public_date'],axis=1)
#对positive_negtive类型进行编码
news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].fillna("中立")
#
dic={}
cate=news_info_clean.positive_negtive.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
#
news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].map(dic)
news_info_clean = news_info_clean.groupby('id',sort=False).agg({'positive_negtive':['mean','min','max'],
                                                               'news_gap':['mean']}).reset_index()
news_info_clean.columns=['id','positive_negtive_mean','positive_negtive_min','positive_negtive_max','news_gap_mean']

#处理annual_report_info的数据
#空值大于0.5的列都删除掉
drop_nan_columns=get_nan_ratio(annual_report_info,thresh=0.5)[0]
annual_report_info_clean=annual_report_info.drop(drop_nan_columns,axis=1)
#对object类型进行编码
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].fillna("无")
dic = {'无':-1,'开业':0, '歇业':1, '停业':2, '清算':3}
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].map(dic)
#
annual_report_info_clean['sum_frt_1']=annual_report_info_clean['COLGRANUM']+annual_report_info_clean['RETSOLNUM']+annual_report_info_clean['DISPERNUM']+annual_report_info_clean['UNENUM']
annual_report_info_clean['sum_frt_2']=annual_report_info_clean['COLEMPLNUM']+annual_report_info_clean['RETEMPLNUM']+annual_report_info_clean['DISEMPLNUM']+annual_report_info_clean['UNEEMPLNUM']
annual_report_info_clean['ratio_1']=annual_report_info_clean['sum_frt_1']/annual_report_info_clean['EMPNUM']
annual_report_info_clean['ratio_2']=annual_report_info_clean['sum_frt_2']/annual_report_info_clean['EMPNUM']
annual_report_info_clean = annual_report_info_clean.groupby('id',sort=False).agg('median')
annual_report_info_clean=pd.DataFrame(annual_report_info_clean).reset_index()
tax_info_clean=tax_info.copy()
tax_info_clean["has_tax"]=[1]*len(tax_info_clean)
tax_info_clean['START_DATE']=pd.to_datetime(tax_info_clean['START_DATE'])
tax_info_clean['END_DATE']=pd.to_datetime(tax_info_clean['END_DATE'])
tax_info_clean['gap_day']=(tax_info_clean['END_DATE']-tax_info_clean['START_DATE']).dt.total_seconds()//3600//24
tax_info_clean=tax_info_clean.drop(['START_DATE','END_DATE'],axis=1)
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].fillna("无")#17 unique
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].fillna("无")#275 TAX_ITEMS
#对object类型进行编码
dic={}
cate=tax_info_clean.TAX_CATEGORIES.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].map(dic)
#
dic={}
cate=tax_info_clean.TAX_ITEMS.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].map(dic)
tax_info_clean['income']=tax_info_clean['TAX_AMOUNT']/tax_info_clean['TAX_RATE']
#
tax_info_clean=tax_info_clean.groupby('id',sort=False).agg({"TAX_CATEGORIES":["count","mean","std"],
                                             "TAX_ITEMS":["mean"],
                                             "TAXATION_BASIS":["mean"],
                                             "TAX_RATE":["mean"],
                                             "DEDUCTION":["mean"],
                                             "TAX_AMOUNT":["sum","mean"],
                                             "gap_day":["mean"],
                                             "income":["sum","mean"],
                                             "has_tax":["mean"]
                                            }).reset_index()
tax_info_clean.columns=['id','tax_count','TAX_CATEGORIES_mean','TAX_CATEGORIES_std','TAX_ITEMS','TAXATION_BASIS',
                           'TAX_RATE','DEDUCTION','TAX_AMOUNT_sum','TAX_AMOUNT_mean','gap_day','income_sum','income_mean','has_tax']
#税额分箱
tax_info_clean['TAX_AMOUNT_sum']=tax_info_clean['TAX_AMOUNT_sum'].fillna(tax_info_clean['TAX_AMOUNT_sum'].median())
tax_info_clean['bucket_TAX_AMOUNT']=pd.qcut(tax_info_clean['TAX_AMOUNT_sum'], 10, labels=False,duplicates='drop')
tax_info_clean=tax_info_clean.fillna(-2)
print('finished .............')

100%|████████████████████████████████████████████████████████████████████████| 45940/45940 [00:00<00:00, 178598.97it/s]


finished .............


## category、object_column 特征单独提取出来

为了避免在使用有监督编码过程中的特征穿越问题，不能笼统的针对所有的类别特征都采取同样的编码方式,

需要结合其自身的特点进行编码;

- 对于较为稀疏的cate列，类别数多，目标只有两类，因此很容易造成特征穿越。不适宜采取有监督编码。



In [264]:
#

In [265]:
#
all_data=base_info_clean.merge(annual_report_info_clean,how='outer')
all_data=all_data.merge(tax_info_clean,how='outer')
all_data=all_data.merge(change_info_clean,how='outer')
all_data=all_data.merge(news_info_clean,how='outer')
all_data=all_data.merge(other_info_clean,how='outer')
all_data=all_data.fillna(-2)
all_data.shape#,base_info.shape,annual_report_info.shape,tax_info.shape
train_df=all_data.merge(entprise_info)
test_df=all_data[all_data['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df=test_df.reset_index(drop=True)
print("all_data:{} train_df:{} test_df:{}".format(all_data.shape,train_df.shape,test_df.shape))

all_data:(24865, 98) train_df:(14865, 99) test_df:(10000, 98)


## 对OBJ_FEATURES特征进行编码

In [266]:
OBJ_FEATURES=all_data.select_dtypes(include=["object"]).columns.tolist()
OBJ_FEATURES.remove('id')

In [267]:
#编码之前先进行一个简单的统计(主要是针对特征类别的稀疏性)
ordinal_encode=[]
supervize_encode=[]
for cat in OBJ_FEATURES:
    if len(all_data)/len(all_data[cat].unique())<100:#>平均不足100个样本,因此很难表达其统计特性
        ordinal_encode.append(cat)
    else:
        supervize_encode.append(cat)
#
encoder_tools=Category_encoders_hepler()
#train_df,test_df=encoder_tools.encode(train_df,test_df,ordinal_encode,mode="ordinal")
#train_df,test_df=encoder_tools.encode(train_df,test_df,supervize_encode,mode="ordinal")
train_df,test_df=encoder_tools.encode(train_df,test_df,OBJ_FEATURES,mode="ordinal")

In [268]:
#
train_data=train_df.drop(['id','label'],axis=1)
kind=train_df['label']
test_data=test_df.drop(['id'],axis=1)
train_data.shape,test_data.shape

((14865, 97), (10000, 97))

In [269]:
train_data

Unnamed: 0,oplocdistrict,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,venind,...,enttype_enttypeitem_industryphy_industryco_ordinal,industryphy_id_prefix_ordinal,enttype_id_prefix_ordinal,industryphy_industryco_id_prefix_ordinal,enttype_enttypeitem_id_prefix_ordinal,industryphy_dom_prefix_ordinal,enttype_dom_prefix_ordinal,enttypegb_industryphy_ordinal,enttypegb_enttype_ordinal,enttypegb_industryphy_enttype_ordinal
0,340223,6,340223010010000000,340200000000115392,0,0,1,5.0,-2.0,-2.0,...,1,1,1,1,1,1,1,1,1,1
1,340202,6,340202010010000000,400000000000753910,0,0,1,2.0,-2.0,-2.0,...,2,2,2,2,2,2,1,2,1,2
2,340221,6,340221010010000000,400000000000013538,0,1,1,2.0,-2.0,-2.0,...,3,3,3,3,3,3,2,3,2,3
3,340202,7,340200000000000000,400000000000283237,0,0,1,-2.0,-2.0,-2.0,...,4,4,4,4,4,2,1,4,3,4
4,340222,6,340222080010000000,340200000000101006,0,1,1,5.0,1.0,3.0,...,5,5,5,5,5,4,3,5,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14860,340202,6,340202010010000000,340200000020001145,0,1,1,5.0,-2.0,-2.0,...,13,11,11,11,12,8,6,11,8,11
14861,340200,6,340200000000000000,340200000000111582,0,1,1,-2.0,-2.0,-2.0,...,146,64,4,234,4,125,22,46,3,46
14862,340202,6,340202010010000000,400000000000753910,0,0,1,1.0,-2.0,-2.0,...,79,79,2,322,2,20,22,15,1,15
14863,340202,6,340202010010000000,340200000000100093,0,1,1,-2.0,-2.0,-2.0,...,98,12,4,129,4,35,22,7,3,7


In [270]:
# # 伪标签
# use_pseudo=True
# if use_pseudo:
#     train_data=train_df.drop(['id','label'],axis=1)
#     kind=train_df['label']
#     pseudo_name=[]
#     pseudo_label={'id':[],'label':[]}
#     pseudo_df=pd.read_csv('submit_857_xgb_rf_lgb_cab.csv')
#     for index,name,score in pseudo_df.itertuples():
#         if score>0.9 or score<0.05:
#             pseudo_label['id'].append(name)
#             if score>0.9 :
#                 pseudo_label['label'].append(1)
#             else:
#                 pseudo_label['label'].append(0)
#             pseudo_name.append(name)
#     len(pseudo_name)
#     pseudo_data=test_df[test_df.id.isin(pseudo_name)].reset_index(drop=True)
#     pseudo_data=pseudo_data.merge(pd.DataFrame(pseudo_label))
#     #
#     train_df=pd.concat((train_df,pseudo_data)).reset_index(drop=True)
#     train_data=train_df.drop(['id','label'],axis=1)
#     kind=train_df['label']
#     print(train_data.shape,test_data.shape)
    

In [271]:
#特征筛选
# frt_select=[
#  'industryphy',
#  'enttypegb',
#  'regcap',
#  'townsign',
#  'industryco',
#  'bucket_regcap',
#  'empnum',
#  'bucket_reccap',
#  'enttypeitem',
#  'industryphy_industryco',
#  'reccap',
#  'FORINVESTSIGN',
#  'positive_negtive',
#  'regtype',
#  'STOCKTRANSIGN',
#  'bucket_regcap_reccap',
#  'enttypegb_enttypeitem',
#  'regcap_reccap',
#  'legal_judgment_num',
#  'TAX_CATEGORIES',
#  'TAX_AMOUNT',
#  'bgq_bgh',
#  'TAX_ITEMS']
# frt_select=important_frt[:30]
# train_data=train_data[frt_select]
# test_data=test_data[frt_select]
# cat_features=list(set(frt_select).intersection(set(cat_features)))
# cat_features

In [272]:
# def eval_score(y_test,y_pre):
#     _,_,f_class,_=precision_recall_fscore_support(y_true=y_test,y_pred=y_pre,labels=[0,1],average=None)
#     fper_class={'合法':f_class[0],'违法':f_class[1],'f1':f1_score(y_test,y_pre)}
#     return fper_class
#
def eval_score(y_test,y_pre):
    valid_f1 = f1_score(y_test, y_pre)
    valid_p = precision_score(y_test, y_pre)
    valid_r = recall_score(y_test, y_pre)
    F = valid_p*0.7+valid_r*0.2+valid_f1*0.1
    return F

#
def k_fold_serachParmaters(model,train_val_data,train_val_kind):
    mean_f1=0
    mean_f1Train=0
    n_splits=5
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    for train, test in sk.split(train_val_data, train_val_kind):
        x_train = train_val_data.iloc[train]
        y_train = train_val_kind.iloc[train]
        x_test = train_val_data.iloc[test]
        y_test = train_val_kind.iloc[test]

        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        weigted_score=  eval_score(y_test,pred)
        mean_f1+=weigted_score/n_splits
        #print(fper_class)
        
        pred_Train = model.predict(x_train)
        weigted_score_train =  eval_score(y_train,pred_Train)
        mean_f1Train+=weigted_score_train/n_splits
    #print('mean valf1:',mean_f1)
    #print('mean trainf1:',mean_f1Train)
    return mean_f1

In [202]:
def search_param(n_estimators,lr,max_depth,num_leaves):
    params={'num_leaves':num_leaves
                   ,'max_depth':max_depth
                   ,'learning_rate':lr
                   ,'n_estimators':n_estimators
                   ,'n_jobs':8
               }
    mean_f1=k_fold_serachParmaters(lightgbm.LGBMClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [150]:
    print('n_estimators:',n_estimators)
    for lr in [0.05,0.06,0.07,0.1]:
        for max_depth in [7,8,9,13,11]:
            for num_leaves in [7,9,11,13]:
                mean_f1=search_param(n_estimators,lr,max_depth,num_leaves)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,num_leaves]
                    best=mean_f1
                    print(param,best)#[150, 0.07, 13, 11] 0.8370422016216938

n_estimators: 150
[150, 0.05, 7, 7] 0.8308745056274318
[150, 0.05, 7, 11] 0.8314844234174977
[150, 0.05, 8, 11] 0.8319370224286234
[150, 0.06, 7, 13] 0.8338354962736693
[150, 0.06, 13, 13] 0.8341930790252486
[150, 0.07, 13, 11] 0.8370422016216938


In [208]:
#
def search_param(n_estimators,lr,max_depth,min_child_weight):
    params={'max_depth':max_depth
              ,'learning_rate':lr
              ,'n_estimators':n_estimators
              ,'min_child_weight':min_child_weight
              ,'n_jobs':8
              ,'importance_type':'total_cover'
           }
    
    mean_f1=k_fold_serachParmaters(xgboost.XGBClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [80]:
    print('n_estimators:',n_estimators)
    for lr in [0.05,0.07,0.09,0.1]:
        for max_depth in [7,8,9,10]:
            for min_child_weight in [7,9,11,13,15]:
                mean_f1=search_param(n_estimators,lr,max_depth,min_child_weight)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,min_child_weight]
                    best=mean_f1
                    print(param,best)#[50, 0.04, 4, 13] 0.8449774193716448

n_estimators: 80
[80, 0.05, 7, 7] 0.8256217607168728
[80, 0.05, 7, 9] 0.8284444493130954
[80, 0.05, 10, 9] 0.829558150242842
[80, 0.07, 7, 7] 0.8299533354880131
[80, 0.07, 10, 9] 0.8304676811777514


In [214]:
def search_param(iter_cnt,lr,max_depth):
    
    params={'iterations':iter_cnt
              ,'learning_rate':lr
              ,'depth':max_depth
              ,'silent':True
              ,'thread_count':8
              ,'task_type':'CPU'
        }
    mean_f1=k_fold_serachParmaters(catboost.CatBoostClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数[52, 0.073, 7] 0.8440488273317126
param=[]
best=0
for iter_cnt in [100]:
    print('iter_cnt:',iter_cnt)
    for lr in [0.08,0.1]:
        for max_depth in [8]:
            mean_f1=search_param(iter_cnt,lr,max_depth)
            if mean_f1>best:
                param=[iter_cnt,lr,max_depth]#[55, 0.076, 7]
                best=mean_f1
                print(param,best)
print(param,best)#筛选特征:[70, 0.06 8] 0.8417684642475657 所有特征:[54, 0.07, 7] 0.8411337269934891


iter_cnt: 100
[100, 0.08, 8] 0.8385293441728544
[100, 0.08, 8] 0.8385293441728544


In [219]:
# #
def search_param(n_estimators,max_depth,min_samples_split):
    params={'oob_score':True,
            'random_state':2020,
            'n_estimators': n_estimators,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split
            }
    mean_f1=k_fold_serachParmaters(RandomForestClassifier(**params),train_data,kind)
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [60]:
    print('n_estimators:',n_estimators)
    for min_samples_split in [8,9]:
        for max_depth in [7,9,11]:
            mean_f1=search_param(n_estimators,max_depth,min_samples_split)
            if mean_f1>best:
                param=[n_estimators,max_depth,min_samples_split]
                best=mean_f1
                print(param,best)#[60, 11, 9] 0.8442795786278773



n_estimators: 60
[60, 7, 8] 0.8124315055365392
[60, 9, 8] 0.832002118438721
[60, 11, 8] 0.8325276971301041
[60, 11, 9] 0.8356076453263122


In [222]:
rf_v1_params = {
    'n_jobs': -1,
    'n_estimators': 50,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0,
    'random_state':2020,
}
rf_v2_params = {
    'oob_score':True,
    'random_state':2020,
    'n_estimators': 60,
    'max_depth':11,
    'min_samples_split':9
}
# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':60,
    'max_features': 0.5,
    'max_depth': 20,
    'min_samples_leaf': 1,
    'verbose': 0,
    'random_state':2020,
}


# Gradient Boosting parameters
gb_params = {
    'n_estimators': 60,
    'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
    'random_state':2020,
}
#
lgb_params = {'num_leaves':11
                ,'max_depth':13
                ,'learning_rate':0.07
                ,'n_estimators':150
                ,'n_jobs':8
             }
xgb_params ={'max_depth':10
              ,'learning_rate':0.07
              ,'n_estimators':80
             ,'min_child_weight':9
              ,'n_jobs':8
              ,'importance_type':'total_cover'
            }
cab_params={'iterations':70
          ,'learning_rate':0.06
          ,'depth':8
          ,'silent':True
          ,'thread_count':8
          ,'task_type':'CPU'
         # ,'cat_features':CAT_FEATURES_INT
}

rf_v1 = RandomForestClassifier(**rf_v1_params)
print("RandomForestClassifier_v1:",k_fold_serachParmaters(rf_v1,train_data,kind))
rf_v2 = RandomForestClassifier(**rf_v2_params)
print("RandomForestClassifier_v2:",k_fold_serachParmaters(rf_v2,train_data,kind))
et = ExtraTreesClassifier(**et_params)
print("ExtraTreesClassifier:",k_fold_serachParmaters(et,train_data,kind))
gb = GradientBoostingClassifier(**gb_params)
print("GradientBoostingClassifier:",k_fold_serachParmaters(gb,train_data,kind))
lgb=lightgbm.LGBMClassifier(**lgb_params)
print('LGBMClassifier:',k_fold_serachParmaters(lgb,train_data,kind))
xgb=xgboost.XGBClassifier(**xgb_params)
print('XGBClassifier:',k_fold_serachParmaters(xgb,train_data,kind))
cab=catboost.CatBoostClassifier(**cab_params)
print('CatBoostClassifier:',k_fold_serachParmaters(cab,train_data,kind))

RandomForestClassifier_v1: 0.8685179961087608
RandomForestClassifier_v2: 0.8356076453263122
ExtraTreesClassifier: 0.8330043205908855
GradientBoostingClassifier: 0.8339789173746215
LGBMClassifier: 0.8370422016216938
XGBClassifier: 0.8304676811777514
CatBoostClassifier: 0.8355527942567414


In [273]:
rf_v1_params = {
    'n_jobs': -1,
    'n_estimators': 50,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0,
    'random_state':2020,
}
rf_v2_params = {
    'oob_score':True,
    'random_state':2020,
    'n_estimators': 60,
    'max_depth':11,
    'min_samples_split':9
}
# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':60,
    'max_features': 0.5,
    'max_depth': 20,
    'min_samples_leaf': 1,
    'verbose': 0,
    'random_state':2020,
}


# Gradient Boosting parameters
gb_params = {
    'n_estimators': 60,
    'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
    'random_state':2020,
}
#
lgb_params = {'num_leaves':11
                ,'max_depth':13
                ,'learning_rate':0.07
                ,'n_estimators':150
                ,'n_jobs':8
             }
xgb_params ={'max_depth':10
              ,'learning_rate':0.07
              ,'n_estimators':80
             ,'min_child_weight':9
              ,'n_jobs':8
              ,'importance_type':'total_cover'
            }
cab_params={'iterations':70
          ,'learning_rate':0.06
          ,'depth':8
          ,'silent':True
          ,'thread_count':8
          ,'task_type':'CPU'
         # ,'cat_features':CAT_FEATURES_INT
}

rf_v1 = RandomForestClassifier(**rf_v1_params)
print("RandomForestClassifier_v1:",k_fold_serachParmaters(rf_v1,train_data,kind))
rf_v2 = RandomForestClassifier(**rf_v2_params)
print("RandomForestClassifier_v2:",k_fold_serachParmaters(rf_v2,train_data,kind))
et = ExtraTreesClassifier(**et_params)
print("ExtraTreesClassifier:",k_fold_serachParmaters(et,train_data,kind))
gb = GradientBoostingClassifier(**gb_params)
print("GradientBoostingClassifier:",k_fold_serachParmaters(gb,train_data,kind))
lgb=lightgbm.LGBMClassifier(**lgb_params)
print('LGBMClassifier:',k_fold_serachParmaters(lgb,train_data,kind))
xgb=xgboost.XGBClassifier(**xgb_params)
print('XGBClassifier:',k_fold_serachParmaters(xgb,train_data,kind))
cab=catboost.CatBoostClassifier(**cab_params)
print('CatBoostClassifier:',k_fold_serachParmaters(cab,train_data,kind))

RandomForestClassifier_v1: 0.8725420817706592
RandomForestClassifier_v2: 0.8195476881154073
ExtraTreesClassifier: 0.8143537162924571
GradientBoostingClassifier: 0.8187633045803323
LGBMClassifier: 0.8166846627861111
XGBClassifier: 0.821511379363949
CatBoostClassifier: 0.8250984560890233


In [274]:
# #
details = []
answers = []
mean_f1=0
n_splits=5
sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
cnt=0
rf=rf_v2
for train, test in sk.split(train_data, kind):
    x_train = train_data.iloc[train]
    y_train = kind.iloc[train]
    x_test = train_data.iloc[test]
    y_test = kind.iloc[test]

    xgb.fit(x_train, y_train)
    pred_xgb = xgb.predict(x_test)
    weight_xgb = eval_score(y_test,pred_xgb)

    lgb.fit(x_train, y_train)
    pred_llf = lgb.predict(x_test)
    weight_lgb = eval_score(y_test,pred_llf)

    cab.fit(x_train, y_train)
    pred_cab = cab.predict(x_test)
    weight_cab =  eval_score(y_test,pred_cab)

    rf.fit(x_train, y_train)
    pred_rf = rf.predict(x_test)
    weight_rf =  eval_score(y_test,pred_rf)


    prob_xgb = xgb.predict_proba(x_test)
    prob_lgb = lgb.predict_proba(x_test)
    prob_cab = cab.predict_proba(x_test)
    prob_rf = rf.predict_proba(x_test)
    #print(prob_xgb.shape,prob_lgb.shape,prob_cab.shape,prob_rf.shape,)

    scores = []
    ijkl = []
    weight = np.arange(0, 1.05, 0.1)
    for i, item1 in enumerate(weight):
        for j, item2 in enumerate(weight[weight <= (1 - item1)]):
            for k, item3 in enumerate(weight[weight <= (1 - item1-item2)]):
                prob_end = prob_xgb * item1 + prob_lgb * item2 + prob_cab *item3+prob_rf*(1 - item1 - item2-item3)
                #prob_end = np.sqrt(prob_xgb**2 * item1 + prob_lgb**2 * item2 + prob_cab**2 *item3+prob_rf**2*(1 - item1 - item2-item3))
                score = eval_score(y_test,np.argmax(prob_end,axis=1))
                scores.append(score)
                ijkl.append((item1, item2,item3, 1 - item1 - item2-item3))

    ii = ijkl[np.argmax(scores)][0]
    jj = ijkl[np.argmax(scores)][1]
    kk = ijkl[np.argmax(scores)][2]
    ll = ijkl[np.argmax(scores)][3]

    details.append(max(scores))
    details.append(weight_xgb)
    details.append(weight_lgb)
    details.append(weight_cab)
    details.append(weight_rf)
    details.append(ii)
    details.append(jj)
    details.append(kk)
    details.append(ll)

    cnt+=1
    print('每{}次验证的f1:{}'.format(cnt,max(scores)))
    mean_f1+=max(scores)/n_splits

    test_xgb = xgb.predict_proba(test_data)
    test_lgb = lgb.predict_proba(test_data)
    test_cab = cab.predict_proba(test_data)
    test_rf = rf.predict_proba(test_data)
    #加权平均
    ans = test_xgb * ii + test_lgb * jj + test_cab * kk + test_rf*ll#加权平均
    #加权平方平均
    #ans = np.sqrt(test_xgb**2 * ii + test_lgb**2 * jj + test_cab**2 * kk + test_rf**2*ll)
    answers.append(ans)
print('mean f1:',mean_f1)

每1次验证的f1:0.8518143385936884
每2次验证的f1:0.8411379589995327
每3次验证的f1:0.8265306122448979
每4次验证的f1:0.8427443301684486
每5次验证的f1:0.8354570254497738
mean f1: 0.8395368530912684


In [275]:
df=pd.DataFrame(np.array(details).reshape(int(len(details)/9),9)
                ,columns=['test_end_score','xgboost','lightgbm','catboost','rf'
                ,'weight_xgboost','weight_lightgbm','weight_catboost','weight_rf'])
df

Unnamed: 0,test_end_score,xgboost,lightgbm,catboost,rf,weight_xgboost,weight_lightgbm,weight_catboost,weight_rf
0,0.851814,0.838174,0.824277,0.824999,0.844433,0.0,0.3,0.2,0.5
1,0.841138,0.838029,0.819034,0.831633,0.810354,0.6,0.1,0.2,0.1
2,0.826531,0.812136,0.812092,0.810755,0.795724,0.7,0.1,0.1,0.1
3,0.842744,0.825195,0.831633,0.822648,0.823163,0.2,0.3,0.3,0.2
4,0.835457,0.794022,0.796389,0.835457,0.824065,0.0,0.0,1.0,0.0


In [276]:
df.mean()

test_end_score     0.839537
xgboost            0.821511
lightgbm           0.816685
catboost           0.825098
rf                 0.819548
weight_xgboost     0.300000
weight_lightgbm    0.160000
weight_catboost    0.360000
weight_rf          0.180000
dtype: float64

In [277]:
#
fina=sum(answers)/n_splits#
#fina=np.sqrt(sum(np.array(answers)**2)/n_splits)#平方平均
fina=fina[:,1]
test_df['score']=fina#可选:fina_persudo是伪标签的预测结果
submit_csv=test_df[['id','score']]
save_path='../submission/'+'submit_'+str(int(mean_f1*1000))+'.csv'
submit_csv.to_csv(save_path,index=False)
save_path

'../submission/submit_839.csv'

In [278]:
submit_csv

Unnamed: 0,id,score
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,0.001898
1,da8691b210adb3f67820f5e0c87b337d63112cee52211888,0.004178
2,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,0.001383
3,f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db,0.003688
4,f000950527a6feb617e8d6ca7025dcf9d765429969122069,0.004439
...,...,...
9995,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,0.012233
9996,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,0.453708
9997,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,0.012285
9998,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,0.039807


In [279]:
#
def cnt_result(xx):
    cnt_re={0:0,1:0}
    for a in xx:
        if a<=0.5:
            cnt_re[0]+=1
        else:
            cnt_re[1]+=1
    return cnt_re
cnt_re=cnt_result(fina)
print("预测结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))
#

预测结果中合法的数量9090;违法的数量910,合法/违法9.989011


In [280]:

#观察训练/验证过程
# df=pd.DataFrame(np.array(details).reshape(int(len(details)/7),7)
#                 ,columns=['test_end_score','xgboost','lightgbm','catboost'
#                 ,'weight_xgboost','weight_lightgbm','weight_catboost'])
# df

# df.mean()

xlf_impt=xgb.feature_importances_
llf_impt=lgb.feature_importances_/sum(lgb.feature_importances_)
clf_impt=cab.feature_importances_/sum(cab.feature_importances_)
rf_impt=rf.feature_importances_/sum(rf.feature_importances_)

importance=pd.DataFrame({
    'column':train_data.columns,
    'importance':xlf_impt+llf_impt+clf_impt+rf_impt,
}).sort_values(by='importance')
importance=importance.reset_index(drop=True)
important_frt=list(importance['column'].values)
important_frt.reverse()

In [281]:
important_frt

['alive_year',
 'enttypegb_industryphy_ordinal',
 'industryphy_enttype_ordinal',
 'enttypeitem_ordinal',
 'tfidif_opscope',
 'industryphy_dom_prefix_ordinal',
 'enttypegb_enttype_ordinal',
 'industryphy_ordinal',
 'regcap',
 'enttype_enttypeitem_ordinal',
 'enttypeminu',
 'regcap_div_reccap',
 'enttypegb',
 'news_gap_mean',
 'reccap',
 'enttypegb_industryphy_enttype_ordinal',
 'regcap_diff_reccap',
 'industryphy_industryco_id_prefix_ordinal',
 'bucket_regcap',
 'enttype_enttypeitem_industryphy_industryco_ordinal',
 'id_dom_ordinal',
 'industryphy_id_prefix_ordinal',
 'industryco_ordinal',
 'enttype_ordinal',
 'townsign',
 'oplocdistrict',
 'EMPNUM',
 'jobid',
 'empnum',
 'enttype_enttypeitem_id_prefix_ordinal',
 'bgxmdm_std',
 'bgq_bgh_median',
 'bgq_bgh_mean',
 'enttype_dom_prefix_ordinal',
 'industryphy_industryco_ordinal',
 'bgxmdm_mean',
 'dom_prefix_ordinal',
 'bucket_regcap_diff_reccap',
 'bgxmdm_sum',
 'bucket_reccap',
 'COLGRANUM',
 'enttype_id_prefix_ordinal',
 'FORINVESTSIGN'

# 两层stacking

In [282]:
# 
use_selectFrt=False
ntrain = train_data.shape[0]
ntest = test_data.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
#kf = KFold(n_splits= NFOLDS, random_state=SEED)
kf=StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2020)
print("-------->>>>>定义一个Sklearn classifier 的扩展<<<<<--------")
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:,1]
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
#"-----------------定义oof:stacking的核心流程-------------"
print("-------->>>>>定义oof:stacking的核心流程<<<<<--------")
'''
'''
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))#(14865,)
    oof_test = np.zeros((ntest,))#(10000,)
    oof_test_skf = np.empty((NFOLDS, ntest))#(5, 10000)
    #
    fold=0
    for train_index, test_index in sk.split(x_train, y_train):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]#该折余下的验证集

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)#对该折余下的验证集做预测，将结果填补在这些数据在原始数据中的位置
        oof_test_skf[fold, :] = clf.predict_proba(x_test)#用此时的模型（第i折的模型）对测试做预测，放在第i折对应的位置
        #oof_train[test_index] = clf.predict(x_te)
        #oof_test_skf[fold, :] = clf.predict(x_test)
        fold+=1

    oof_test[:] = oof_test_skf.mean(axis=0)#将这N折模型对测试集的预测结果进行一个平均，作为改模型的预测结果
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
#
'''
第一层的基模型
'''
# Put in our parameters for said classifiers
# Random Forest parameters
# 
print("-------->>>>>第一层的基模型<<<<<--------")
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_v2_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
lgb= SklearnHelper(clf=lightgbm.LGBMClassifier, seed=SEED, params=lgb_params)
xgb = SklearnHelper(clf=xgboost.XGBClassifier, seed=SEED, params=xgb_params)
cab=SklearnHelper(clf=catboost.CatBoostClassifier, seed=SEED, params=cab_params)

#
#
y_train = kind

x_train = train_data.values # Creates an array of the train data
x_test = test_data.values # Creats an array of the test data
#
print("-------->>>>>训练、测试数据<<<<<--------")
print("y_train:{};train_data:{};test_data:{}".format(y_train.shape,train_data.shape,test_data.shape))
#
# Create our OOF train and test predictions. These base results will be used as new features
print("-------->>>>>第一阶段训练oof<<<<<--------")
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
print("Extra Trees.............")
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
print("Random Forest.............")
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
print("Gradient.............")
lgb_oof_train, lgb_oof_test = get_oof(lgb,x_train, y_train, x_test) # Gradient Boost
print("LGBClassifier.............")
xgb_oof_train, xgb_oof_test = get_oof(xgb,x_train, y_train, x_test) # XGBClassifier
print("XGBClassifier.............")
cab_oof_train, cab_oof_test = get_oof(cab,x_train, y_train, x_test)# CatClassifier
print("CatClassifier.............")

print("Training Stage_1 is complete")

-------->>>>>定义一个Sklearn classifier 的扩展<<<<<--------
-------->>>>>定义oof:stacking的核心流程<<<<<--------
-------->>>>>第一层的基模型<<<<<--------
-------->>>>>训练、测试数据<<<<<--------
y_train:(14865,);train_data:(14865, 97);test_data:(10000, 97)
-------->>>>>第一阶段训练oof<<<<<--------
Extra Trees.............
Random Forest.............
Gradient.............
LGBClassifier.............
XGBClassifier.............
CatClassifier.............
Training Stage_1 is complete


# stage-2的调参

In [240]:
def search_param(n_estimators,lr,max_depth,num_leaves):
    params={'num_leaves':num_leaves
                   ,'max_depth':max_depth
                   ,'learning_rate':lr
                   ,'n_estimators':n_estimators
                   ,'n_jobs':8
               }
    mean_f1=k_fold_serachParmaters(lightgbm.LGBMClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [80]:
    print('n_estimators:',n_estimators)
    for lr in [0.04,0.05,0.06]:
        for max_depth in [4,5,6,7,8]:
            for num_leaves in [4,5,6,7,8]:
                mean_f1=search_param(n_estimators,lr,max_depth,num_leaves)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,num_leaves]
                    best=mean_f1
                    print(param,best)#[80, 0.05, 4, 4] 0.8419565786981478

n_estimators: 80
[80, 0.04, 4, 4] 0.8389775244206701
[80, 0.05, 4, 4] 0.8399405566673567


In [241]:
def search_param(n_estimators,lr,max_depth,min_child_weight):
    params={'max_depth':max_depth
              ,'learning_rate':lr
              ,'n_estimators':n_estimators
              ,'min_child_weight':min_child_weight
              ,'n_jobs':8
              ,'importance_type':'total_cover'
           }
    
    mean_f1=k_fold_serachParmaters(xgboost.XGBClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [50]:
    print('n_estimators:',n_estimators)
    for lr in [0.03,0.04]:
        for max_depth in [3,4,5,6]:
            for min_child_weight in [5,7,9,11,13,15]:
                mean_f1=search_param(n_estimators,lr,max_depth,min_child_weight)
                if mean_f1>best:
                    param=[n_estimators,lr,max_depth,min_child_weight]
                    best=mean_f1
                    print(param,best)#[50, 0.04, 4, 13] 0.8449774193716448

n_estimators: 50
[50, 0.03, 3, 5] 0.8285658885103605
[50, 0.03, 3, 7] 0.830231529550203
[50, 0.03, 3, 9] 0.8316167765653166
[50, 0.03, 3, 11] 0.8325856853741764
[50, 0.03, 3, 13] 0.835269062766804


In [242]:
def search_param(iter_cnt,lr,max_depth):
    
    params={'iterations':iter_cnt
              ,'learning_rate':lr
              ,'depth':max_depth
              ,'silent':True
              ,'thread_count':8
              ,'task_type':'CPU'
        }
    mean_f1=k_fold_serachParmaters(catboost.CatBoostClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数[52, 0.073, 7] 0.8440488273317126
param=[]
best=0
for iter_cnt in [60,55]:
    print('iter_cnt:',iter_cnt)
    for lr in [0.03,0.04,0.05]:
        for max_depth in [4,5,6]:
            mean_f1=search_param(iter_cnt,lr,max_depth)
            if mean_f1>best:
                param=[iter_cnt,lr,max_depth]#[55, 0.04, 5] 0.8475966668074264
                best=mean_f1
                print(param,best)
print(param,best)#[55, 0.04, 5] 0.8475966668074264


iter_cnt: 60
[60, 0.03, 4] 0.8368955331348376
[60, 0.04, 5] 0.8373802817468526
[60, 0.05, 5] 0.8375477186456622
iter_cnt: 55
[60, 0.05, 5] 0.8375477186456622


In [243]:
# #
def search_param(n_estimators,max_depth,min_samples_split):
    params={'oob_score':True,
            'random_state':2020,
            'n_estimators': n_estimators,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split
            }
    mean_f1=k_fold_serachParmaters(RandomForestClassifier(**params),pd.DataFrame(x_train),pd.DataFrame(y_train))
    return mean_f1

#搜索最佳参数
param=[]
best=0
for n_estimators in [40,50]:
    print('n_estimators:',n_estimators)
    for min_samples_split in [4,5,7,8]:
        for max_depth in [4,5,7,9,]:
            mean_f1=search_param(n_estimators,max_depth,min_samples_split)
            if mean_f1>best:
                param=[n_estimators,max_depth,min_samples_split]
                best=mean_f1
                print(param,best)#[50, 4, 8] 0.8442795786278773



n_estimators: 40
[40, 4, 4] 0.8314564206726864
[40, 5, 4] 0.8334650927578162
[40, 5, 5] 0.8346889391653527
[40, 5, 7] 0.8352733404040108
[40, 5, 8] 0.8364672803657551
n_estimators: 50
[50, 5, 4] 0.8365538263042755
[50, 5, 7] 0.8371252073213781


In [283]:
#stage_2的模型调参

lgb_params_stage2 = {'num_leaves':4
                ,'max_depth':4
                ,'learning_rate':0.05
                ,'n_estimators':80
                ,'n_jobs':8
             }
xgb_params_stage2 ={'max_depth':3
              ,'learning_rate':0.03
              ,'n_estimators':50
             ,'min_child_weight':13
              ,'n_jobs':8
              ,'importance_type':'total_cover'
            }
cab_params_stage2={'iterations':60
          ,'learning_rate':0.05
          ,'depth':5
          ,'silent':True
          ,'thread_count':8
          ,'task_type':'CPU'
          #,'cat_features':cat_features
}
rf_params_stage2={'oob_score':True,
            'random_state':2020,
            'n_estimators': 50,
            'max_depth':7,
            'min_samples_split':7
            }
# #
x_train = np.concatenate(( et_oof_train, rf_oof_train, gb_oof_train,lgb_oof_train, xgb_oof_train,cab_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test,gb_oof_test,lgb_oof_test, xgb_oof_test, cab_oof_test), axis=1)
#
lgb_stage2=lightgbm.LGBMClassifier(**lgb_params_stage2)
print('LGBMClassifier:',k_fold_serachParmaters(lgb_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
xgb_stage2=xgboost.XGBClassifier(**xgb_params_stage2)
print('XGBClassifier:',k_fold_serachParmaters(xgb_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
cab_stage2=catboost.CatBoostClassifier(**cab_params_stage2)
print('CatBoostClassifier:',k_fold_serachParmaters(cab_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))
rf_stage2 = RandomForestClassifier(**rf_params_stage2)
print("RandomForestClassifier_v1:",k_fold_serachParmaters(rf_stage2,pd.DataFrame(x_train),pd.DataFrame(y_train)))

LGBMClassifier: 0.8241452460027052
XGBClassifier: 0.820427862402487
CatBoostClassifier: 0.8191138393920616
RandomForestClassifier_v1: 0.8215444728633191


In [284]:
def cnt_result(xx):
    cnt_re={0:0,1:0}
    for a in xx:
        if a<=0.5:
            cnt_re[0]+=1
        else:
            cnt_re[1]+=1
    return cnt_re
print("Training is complete")
print("------第二阶段训练开始--------")
#
predictions_stage2=[]
for model_two_stage in [xgb_stage2,cab_stage2,rf_stage2]:
    #
    model_two_stage.fit(x_train, y_train)
    predictions = model_two_stage.predict_proba(x_test)[:,1]
    predictions_stage2.append(predictions)
    cnt_re=cnt_result(predictions)
    print("预测结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))
#
predictions=sum(predictions_stage2)/len(predictions_stage2)#
#predictions=np.sqrt(sum(np.array(np.array(predictions_stage2))**2)/len(predictions_stage2))#平方平均
#predictions=pow(np.prod(np.array(predictions_stage2), axis=0),1/len(predictions_stage2))#几何平均
cnt_re=cnt_result(predictions)
print("预测结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))

Training is complete
------第二阶段训练开始--------
预测结果中合法的数量9079;违法的数量921,合法/违法9.857763
预测结果中合法的数量9027;违法的数量973,合法/违法9.277492
预测结果中合法的数量9067;违法的数量933,合法/违法9.718114
预测结果中合法的数量9062;违法的数量938,合法/违法9.660981


In [285]:
votes=[(pre>0.5).astype(int) for pre in predictions_stage2]
vote_most=[]
for i in range(len(predictions_stage2[0])):
    vote_list=np.array(votes)[:,i]
    tmp = {0: 0, 1: 0}
    for k in vote_list:
        tmp[k] += 1
    #
    most = sorted(tmp.items(), key=lambda item: item[1])[-1][0]
    vote_most.append(most)
cnt_re=cnt_result(vote_most)
print("投票结果中合法的数量%d;违法的数量%d,合法/违法%f"%(cnt_re[0],cnt_re[1],cnt_re[0]/cnt_re[1]))

投票结果中合法的数量9066;违法的数量934,合法/违法9.706638


In [286]:
accuracy_score((predictions>0.5).astype(int),vote_most)

0.9988

In [287]:

test_df['score']=vote_most#predictions#可选:fina_persudo是伪标签的预测结果
submit_csv=test_df[['id','score']]
save_path='../submission/submit_stack_cab_lgb_xgb_rf'+str(cnt_re[1])+'.csv'
submit_csv.to_csv(save_path,index=False)
save_path

'../submission/submit_stack_cab_lgb_xgb_rf934.csv'

In [257]:
submit_csv

Unnamed: 0,id,score
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,0.017476
1,da8691b210adb3f67820f5e0c87b337d63112cee52211888,0.017356
2,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,0.017639
3,f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db,0.018168
4,f000950527a6feb617e8d6ca7025dcf9d765429969122069,0.018279
...,...,...
9995,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,0.019584
9996,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,0.481104
9997,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,0.017947
9998,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,0.026300
