# 企业非法集资风险预测

导入所需的库

In [1]:
import pandas as pd
import numpy as np
import datetime
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.metrics import f1_score,precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and other warnings raised by pandas
# and the model libraries -- confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 数据分析

读取所有文件，导入数据集

In [2]:
# Load every table from the working directory; each CSV file is named after
# the variable it is bound to.
(base_info, annual_report_info, tax_info, change_info,
 news_info, other_info, entprise_info, entprise_evaluate) = (
    pd.read_csv(table + '.csv')
    for table in ('base_info', 'annual_report_info', 'tax_info', 'change_info',
                  'news_info', 'other_info', 'entprise_info', 'entprise_evaluate')
)

根据提供的信息，可知最后两个分别为训练集标签与待预测的验证集标签，只有id与标签两列

各数据集样本数量以及所包含的企业数量

In [3]:
# For each table, report its (rows, columns) shape and how many distinct
# company ids it contains.
tables = [
    ('base_info', base_info),
    ('annual_report_info', annual_report_info),
    ('tax_info', tax_info),
    ('change_info', change_info),
    ('news_info', news_info),
    ('other_info', other_info),
    ('entprise_info', entprise_info),
    ('entprise_evaluate', entprise_evaluate),
]
for table_name, frame in tables:
    print('{0} shape: {1}; {0} unique: {2}'.format(table_name, frame.shape, len(frame['id'].unique())))

base_info shape: (24865, 33); base_info unique: 24865
annual_report_info shape: (22550, 23); annual_report_info unique: 8937
tax_info shape: (29195, 9); tax_info unique: 808
change_info shape: (45940, 5); change_info unique: 8726
news_info shape: (10518, 3); news_info unique: 927
other_info shape: (1890, 4); other_info unique: 1888
entprise_info shape: (14865, 2); entprise_info unique: 14865
entprise_evaluate shape: (10000, 2); entprise_evaluate unique: 10000


从以上信息可以看出，整个数据集共有24865家企业。除了base_info完整地提供了各企业的基本信息外，其他数据集的企业附加信息并不完整，有很多企业并没有提供，新闻信息这一类更是缺失严重。除此之外，我们还能看到带标签的训练集样本有14865条，待预测的验证集样本有10000条

查看前六个数据集的信息

In [4]:
# Print column names, dtypes and non-null counts of the base-information table.
base_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   opto           8825 non-null   object 
 10  state          24865 non-null  int64  
 11  orgid          24865 non-null  int64  
 12  jobid          24865 non-null  int64  
 13  adbusign       24865 non-null  int64  
 14  townsign       24865 non-null  int64  
 15  regtype        24865 non-null  int64  
 16  empnum         19615 non-null  float64
 17  compform       10631 non-null  float64
 18  parnum

In [5]:
# Print column names, dtypes and non-null counts of the annual-report table.
annual_report_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22550 entries, 0 to 22549
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             22550 non-null  object 
 1   ANCHEYEAR      22550 non-null  float64
 2   STATE          22545 non-null  float64
 3   FUNDAM         5702 non-null   float64
 4   MEMNUM         29 non-null     float64
 5   FARNUM         29 non-null     float64
 6   ANNNEWMEMNUM   29 non-null     float64
 7   ANNREDMEMNUM   29 non-null     float64
 8   EMPNUM         22535 non-null  float64
 9   EMPNUMSIGN     16833 non-null  float64
 10  BUSSTNAME      17680 non-null  object 
 11  COLGRANUM      20041 non-null  float64
 12  RETSOLNUM      20041 non-null  float64
 13  DISPERNUM      20041 non-null  float64
 14  UNENUM         20041 non-null  float64
 15  COLEMPLNUM     20041 non-null  float64
 16  RETEMPLNUM     20041 non-null  float64
 17  DISEMPLNUM     20041 non-null  float64
 18  UNEEMP

In [6]:
# Print column names, dtypes and non-null counts of the tax table.
tax_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29195 entries, 0 to 29194
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29195 non-null  object 
 1   START_DATE      29195 non-null  object 
 2   END_DATE        29195 non-null  object 
 3   TAX_CATEGORIES  29195 non-null  object 
 4   TAX_ITEMS       29195 non-null  object 
 5   TAXATION_BASIS  25816 non-null  float64
 6   TAX_RATE        25816 non-null  float64
 7   DEDUCTION       24235 non-null  float64
 8   TAX_AMOUNT      29195 non-null  float64
dtypes: float64(4), object(5)
memory usage: 2.0+ MB


In [7]:
# Print column names, dtypes and non-null counts of the change-record table.
change_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45940 entries, 0 to 45939
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      45940 non-null  object 
 1   bgxmdm  45940 non-null  float64
 2   bgq     45940 non-null  object 
 3   bgh     45940 non-null  object 
 4   bgrq    45940 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.8+ MB


In [8]:
# Print column names, dtypes and non-null counts of the news table.
news_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10518 entries, 0 to 10517
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                10518 non-null  object
 1   positive_negtive  10518 non-null  object
 2   public_date       10518 non-null  object
dtypes: object(3)
memory usage: 246.6+ KB


In [9]:
# Print column names, dtypes and non-null counts of the other-information table.
other_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1890 entries, 0 to 1889
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  1890 non-null   object 
 1   legal_judgment_num  1006 non-null   float64
 2   brand_num           909 non-null    float64
 3   patent_num          396 non-null    float64
dtypes: float64(3), object(1)
memory usage: 59.2+ KB


如上述所言，除了企业基本信息外，其余数据集企业空缺都比较大，所以最先考虑只用base_info一个数据集来建模。

In [10]:
# Work on a copy so that base_info itself stays untouched by the preprocessing
# steps that follow.
data_set = base_info.copy()

## 数据预处理

查看base_info各特征缺失率，并将缺失率超过0.5的特征删除

In [11]:
# Fraction of missing values per column, computed once for the whole frame
# (the per-column null count used to be recomputed on every loop iteration).
empty_rates = base_info.isnull().sum() / len(base_info)

for fea in base_info.columns:
    empty_rate = empty_rates[fea]
    print('{} empty rate: {}'.format(fea, empty_rate))
    if empty_rate > 0.5:
        # errors='ignore' keeps the cell re-runnable: on a second run the
        # column is already gone and the drop becomes a no-op instead of a KeyError.
        data_set.drop(fea, axis=1, inplace=True, errors='ignore')
        print('[remove {}]'.format(fea))

id empty rate: 0.0
oplocdistrict empty rate: 0.0
industryphy empty rate: 0.0
industryco empty rate: 4.021717273275689e-05
dom empty rate: 0.0
opscope empty rate: 0.0
enttype empty rate: 0.0
enttypeitem empty rate: 0.33034385682686507
opfrom empty rate: 0.0
opto empty rate: 0.6450834506334204
[remove opto]
state empty rate: 0.0
orgid empty rate: 0.0
jobid empty rate: 0.0
adbusign empty rate: 0.0
townsign empty rate: 0.0
regtype empty rate: 0.0
empnum empty rate: 0.21114015684697365
compform empty rate: 0.5724512366780615
[remove compform]
parnum empty rate: 0.9059320329780817
[remove parnum]
exenum empty rate: 0.944580735974261
[remove exenum]
opform empty rate: 0.638045445405188
[remove opform]
ptbusscope empty rate: 1.0
[remove ptbusscope]
venind empty rate: 0.6606877136537301
[remove venind]
enttypeminu empty rate: 0.7076211542328574
[remove enttypeminu]
midpreindcode empty rate: 1.0
[remove midpreindcode]
protype empty rate: 0.9986326161270863
[remove protype]
oploc empty rate: 0.0


查看拥有唯一值的特征

In [12]:
# Columns that carry at most one distinct non-null value hold no information.
distinct_counts = data_set.nunique()
unique_val = list(distinct_counts[distinct_counts <= 1].index)
unique_val

[]

没有具有唯一值的特征，查看当前base_info信息

In [13]:
# Print the columns that remain in data_set after the sparse ones were dropped.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   state          24865 non-null  int64  
 10  orgid          24865 non-null  int64  
 11  jobid          24865 non-null  int64  
 12  adbusign       24865 non-null  int64  
 13  townsign       24865 non-null  int64  
 14  regtype        24865 non-null  int64  
 15  empnum         19615 non-null  float64
 16  oploc          24865 non-null  object 
 17  regcap         24674 non-null  float64
 18  enttyp

特征数据类型划分

In [14]:
# Names of every column whose dtype is not `object`.
non_object_cols = data_set.select_dtypes(exclude=['object']).columns
num_fea = non_object_cols.tolist()
num_fea

['oplocdistrict',
 'industryco',
 'enttype',
 'enttypeitem',
 'state',
 'orgid',
 'jobid',
 'adbusign',
 'townsign',
 'regtype',
 'empnum',
 'regcap',
 'enttypegb']

In [15]:
# The remaining (object-typed) columns, kept in their original column order.
obj_fea = [col for col in data_set.columns if col not in num_fea]
obj_fea

['id', 'industryphy', 'dom', 'opscope', 'opfrom', 'oploc']

这一步我们需要做的是将对象型特征转化为数值型，以便后续缺失值的填补以及其他操作。

首先我们要了解这几个对象型特征的含义，id不用管，该特征对最后建模没有影响；industryphy为行业类别代码，dom为经营地址；opscope为经营范围；opfrom为经营期限起，oploc为经营场所

查看这几个属性的具体特征值

In [16]:
# Show the first rows of the object-typed columns to see their raw values.
data_set[obj_fea].head()

Unnamed: 0,id,industryphy,dom,opscope,opfrom,oploc
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,M,31487d8f256f16bd6244b7251be2ebb24d1db51663c654...,纳米新材料、机械设备、五金配件加工、销售及技术推广服务，道路货物运输。（依法须经批准的项目，...,2019-07-11 00:00:00,2367b4cac96d8598
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,O,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,健身服务。（依法须经批准的项目，经相关部门批准后方可开展经营活动）,2017-09-06,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,R,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,文化娱乐经纪人服务；境内文艺活动组织与策划；文化艺术交流活动组织策划；演出经纪；其他文化艺术...,2020-09-14 14:46:30,2367b4cac96d8598
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,L,746df9aaed8578571760c563abe882c8ba25209fc6d5db...,投资管理及咨询(证券、期货除外)；企业管理。（依法须经批准的项目，经相关部门批准后方可开展经...,2015-09-30,2367b4cac96d8598
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,R,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,境内文化艺术交流活动策划；企业形象策划；礼仪庆典服务；翻译服务；专利代理；广告设计、制作、代...,2017-12-01,2367b4cac96d8598


- 对industryphy进行处理

In [17]:
# List the distinct industryphy codes.
data_set['industryphy'].unique()

array(['M', 'O', 'R', 'L', 'P', 'J', 'Q', 'N', 'F', 'E', 'C', 'K', 'D',
       'I', 'S', 'G', 'A', 'T', 'H', 'B'], dtype=object)

可知该特征值是离散的，且相互之间不存在大小关系，采用one-hot映射

In [18]:
# One-hot encode industryphy: one indicator column per code, appended after
# the existing columns, then the original column is removed.
industry_dummies = pd.get_dummies(data_set['industryphy'])
data_set = pd.concat([data_set.drop(columns='industryphy'), industry_dummies], axis=1)
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryco     24864 non-null  float64
 3   dom            24865 non-null  object 
 4   opscope        24865 non-null  object 
 5   enttype        24865 non-null  int64  
 6   enttypeitem    16651 non-null  float64
 7   opfrom         24865 non-null  object 
 8   state          24865 non-null  int64  
 9   orgid          24865 non-null  int64  
 10  jobid          24865 non-null  int64  
 11  adbusign       24865 non-null  int64  
 12  townsign       24865 non-null  int64  
 13  regtype        24865 non-null  int64  
 14  empnum         19615 non-null  float64
 15  oploc          24865 non-null  object 
 16  regcap         24674 non-null  float64
 17  enttypegb      24865 non-null  int64  
 18  A     

- 对dom进行处理

In [19]:
# Number of distinct dom values (missing values, if any, count as one value).
data_set['dom'].nunique(dropna=False)

23278

该特征暂时没有好的转换方式，故采用直接映射的方法

In [20]:
# Replace each dom string by an integer code assigned in order of first
# appearance; the codes are identifiers only and carry no ordinal meaning.
mapping = {value: code for code, value in enumerate(data_set['dom'].unique())}
data_set['dom'] = data_set['dom'].map(mapping)

- 对opscope处理

该特征由汉字构成，暂时能想到的方法就是词频统计，但比较麻烦，所以这里先不对其进行处理，直接去掉该特征

In [21]:
# opscope is free text and is not used as a feature here.
data_set = data_set.drop(columns='opscope')

- 对opfrom进行处理

可以将该特征转化为与某一特定时间之间的时间差。同时观察数据可以得知，大多数企业的经营区间都是50年，由此可以大致推算填补opto的缺失值，不过这里先不实现，等提交过后再来进行该处理看分数是否有提高。

In [22]:
# opfrom mixes date-only strings ('2017-09-06') with full timestamps
# ('2019-07-11 00:00:00'). Parsing the whole column with the single strict
# format '%Y-%m-%d' raises on pandas >= 2.0 for the timestamp rows, so each
# value is parsed on its own, which accepts both layouts on any pandas version.
data_set['opfrom'] = pd.to_datetime(data_set['opfrom'].map(pd.Timestamp))

# Reference date from which the age of each company is measured.
startdate = datetime.datetime.strptime('2020-10-13', '%Y-%m-%d')

# Whole days between opfrom and the reference date (vectorised subtraction;
# .dt.days keeps only the day component of each difference).
data_set['opfromDays'] = (startdate - data_set['opfrom']).dt.days
data_set.drop('opfrom', axis=1, inplace=True)

- 对oploc进行处理

做和dom一样的处理，直接映射

In [23]:
# Replace each oploc string by an integer code assigned in order of first
# appearance; the codes are identifiers only and carry no ordinal meaning.
mapping = {value: code for code, value in enumerate(data_set['oploc'].unique())}
data_set['oploc'] = data_set['oploc'].map(mapping)

至此特征类型的简单编码就完成了，查看处理后的数据

In [24]:
# Show the first rows of data_set after the object columns were encoded.
data_set.head()

Unnamed: 0,id,oplocdistrict,industryco,dom,enttype,enttypeitem,state,orgid,jobid,adbusign,townsign,regtype,empnum,oploc,regcap,enttypegb,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,opfromDays
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,7513.0,0,1100,1150.0,6,340223010010000000,340200000000115392,0,0,1,5.0,0,50.0,1151,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,460
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,8090.0,1,9600,,6,340222060010000000,340200000000112114,0,1,1,3.0,1,10.0,9600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1133
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,9053.0,2,1100,1150.0,6,340202010010000000,400000000000753910,0,0,1,2.0,0,100.0,1151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,28
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,7212.0,3,4500,4540.0,6,340221010010000000,400000000000013538,0,1,1,2.0,0,10.0,4540,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1840
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,8810.0,4,1100,1130.0,7,340200000000000000,400000000000283237,0,0,1,,0,100.0,1130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1047


经观察，enttype、enttypeitem、enttypeminu和enttypegb四个特征都是对企业类型的描述，前三个从左到右细分程度依次增高，而enttypegb包含了前三者的全部信息，所以前三个特征可以去除。其中enttypeminu已在前面因缺失率超过0.5被删除，这里只需再删除enttype和enttypeitem

In [25]:
# Remove the two coarser company-type columns.
data_set = data_set.drop(columns=['enttype', 'enttypeitem'])

查看各特征缺失数量

In [26]:
# Count the missing values that remain in each column.
data_set.isnull().sum()

id                  0
oplocdistrict       0
industryco          1
dom                 0
state               0
orgid               0
jobid               0
adbusign            0
townsign            0
regtype             0
empnum           5250
oploc               0
regcap            191
enttypegb           0
A                   0
B                   0
C                   0
D                   0
E                   0
F                   0
G                   0
H                   0
I                   0
J                   0
K                   0
L                   0
M                   0
N                   0
O                   0
P                   0
Q                   0
R                   0
S                   0
T                   0
opfromDays          0
dtype: int64

用中位数填补缺失值

In [27]:
# Fill the three columns that still contain gaps with their own column median.
empty_fea = ['industryco', 'empnum', 'regcap']
column_medians = data_set[empty_fea].median()
data_set[empty_fea] = data_set[empty_fea].fillna(column_medians)

## 建模

分离训练集和验证集

In [28]:
# Join the features to each label table on the company id. The key is spelled
# out (instead of "every shared column") so that a feature column that happens
# to share a name with a label column can never silently change the join, and
# validate= makes pandas raise if an id is duplicated on either side.
train_set = data_set.merge(entprise_info, on='id', how='inner', validate='one_to_one')
train_data = train_set.drop(columns=['id', 'label'])
train_label = train_set['label']

# entprise_evaluate lists the ids to predict; its 'score' column is dropped
# from the features together with the id.
test_set = data_set.merge(entprise_evaluate, on='id', how='inner', validate='one_to_one')
test_data = test_set.drop(columns=['id', 'score'])

train_data.shape, test_data.shape

((14865, 34), (10000, 34))

定义评估函数，通过k折验证得到不同模型的分数，以便调参

In [29]:
def evaluateModel(model,x,y):
    """Return the mean F1 score of ``model`` over stratified 5-fold cross-validation.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``.
        It is refit on every fold, so on return it holds the last fold's fit.
    x : pandas.DataFrame
        Feature matrix; rows are selected by position with ``.iloc``.
    y : pandas.Series or array-like
        Labels aligned row by row with ``x``.

    Returns
    -------
    float
        Average of the per-fold ``f1_score`` values (default scorer settings).
    """
    mean_f1 = 0
    folds = 5
    sk = StratifiedKFold(n_splits=folds, shuffle=True, random_state=2020)

    # The splitter yields positional indices. Indexing a Series with y[idx]
    # is label-based and only coincides with positions for a 0..n-1 index, so
    # the labels are taken as a plain array and indexed positionally.
    labels = np.asarray(y)

    for trn_idx, val_idx in sk.split(x, labels):
        trn_x, val_x = x.iloc[trn_idx], x.iloc[val_idx]
        trn_y, val_y = labels[trn_idx], labels[val_idx]

        model.fit(trn_x, trn_y)
        val_pred = model.predict(val_x)
        mean_f1 += f1_score(val_y, val_pred)/sk.n_splits

    return mean_f1

简单训练随机森林，xgboost，lightgbm，catboost四个模型，得到对应的分数

In [30]:
# Four baseline classifiers with hand-picked hyper-parameters.
rf = RandomForestClassifier(
    n_estimators=70,
    max_depth=12,
    min_samples_split=5,
    oob_score=True,
    random_state=2020,
)

xlf = xgb.XGBClassifier(
    n_estimators=75,
    max_depth=7,
    learning_rate=0.02,
    reg_alpha=0.005,
    importance_type='total_cover',
    n_jobs=8,
)

llf = lgb.LGBMClassifier(
    n_estimators=80,
    num_leaves=9,
    max_depth=7,
    learning_rate=0.05,
    n_jobs=8,
)

clf = cat.CatBoostClassifier(
    iterations=75,
    depth=10,
    learning_rate=0.05,
    task_type='CPU',
    thread_count=8,
    silent=True,
)

# Cross-validated F1 of each model, printed in the same order as defined above.
for tag, estimator in (('rf', rf), ('xlf', xlf), ('llf', llf), ('clf', clf)):
    print(tag + ':', evaluateModel(estimator, train_data, train_label))

rf: 0.8177939123167672
xlf: 0.8235263832132351
llf: 0.8210400630571153
clf: 0.8197626141028068


采用加权平均的方法对四个模型进行模型融合

In [31]:
def _search_blend_weights(fold_probs, y_true, steps=10):
    """Grid-search non-negative blend weights (summing to 1) that maximise F1.

    fold_probs : sequence of four 2-D class-probability arrays, one per model,
        as returned by ``predict_proba`` on the same validation rows.
    y_true : true labels of those rows.
    steps : number of grid intervals; weights are multiples of ``1/steps``.

    Returns ``(best_score, best_weights)``. The grid is enumerated with
    integers, so no combination is lost to floating-point comparison (a float
    grid built with ``np.arange(0, 1.05, 0.1)`` skips e.g. 0.3 + 0.7 because
    ``0.7000000000000001 <= 1 - 0.30000000000000004`` is False).
    """
    # Start below any reachable F1 so that a weight tuple is always returned,
    # even when every blend scores 0.
    best_score, best_weights = -1.0, None
    for i in range(steps + 1):
        for j in range(steps + 1 - i):
            for k in range(steps + 1 - i - j):
                weights = tuple(w / steps for w in (i, j, k, steps - i - j - k))
                blended = sum(w * prob for w, prob in zip(weights, fold_probs))
                score = f1_score(y_true, np.argmax(blended, axis=1))
                if score > best_score:
                    best_score, best_weights = score, weights
    return best_score, best_weights


mean_f1 = 0
folds = 5
sk = StratifiedKFold(n_splits=folds, shuffle=True, random_state=2020)
results = []
# Order matters: the reported weights follow rf, xgb, lgb, cat.
blend_models = [rf, xlf, llf, clf]

for idx, (trn_idx, val_idx) in enumerate(sk.split(train_data, train_label)):
    # The splitter yields positions, so select rows and labels with .iloc.
    trn_x, val_x = train_data.iloc[trn_idx], train_data.iloc[val_idx]
    trn_y, val_y = train_label.iloc[trn_idx], train_label.iloc[val_idx]

    # Fit each model on the training part and keep its validation probabilities.
    val_probs = []
    for model in blend_models:
        model.fit(trn_x, trn_y)
        val_probs.append(model.predict_proba(val_x))

    # NOTE: the weights are tuned on the same fold they are scored on, so the
    # per-fold score (and mean_f1) is an optimistic estimate.
    maxscore, optweight = _search_blend_weights(val_probs, val_y)
    print('第{}次验证f1_score：{}'.format(idx + 1, maxscore))
    print('权重为rf, xgb, lgb, cat：', optweight)
    mean_f1 += maxscore/sk.n_splits

    # Blend this fold's models on the rows to predict, using this fold's weights.
    test_pred = sum(w * model.predict_proba(test_data)
                    for w, model in zip(optweight, blend_models))
    results.append(test_pred)
print('线上验证f1_score: ', mean_f1)

第1次验证f1_score：0.8431876606683805
权重为rf, xgb, lgb, cat： (0.4, 0.0, 0.4, 0.19999999999999996)
第2次验证f1_score：0.8454106280193237
权重为rf, xgb, lgb, cat： (0.0, 0.6000000000000001, 0.0, 0.3999999999999999)
第3次验证f1_score：0.8186274509803921
权重为rf, xgb, lgb, cat： (0.6000000000000001, 0.1, 0.1, 0.19999999999999993)
第4次验证f1_score：0.8337349397590361
权重为rf, xgb, lgb, cat： (0.0, 0.1, 0.6000000000000001, 0.29999999999999993)
第5次验证f1_score：0.8264058679706601
权重为rf, xgb, lgb, cat： (0.0, 1.0, 0.0, 0.0)
线上验证f1_score:  0.8334733094795586


In [32]:
# Average the blended probabilities of all folds, then keep the column at
# index 1 as the submitted score.
mean_prob = sum(results) / sk.n_splits
result = mean_prob[:, 1]
submit_file = pd.DataFrame({'id': test_set['id'], 'score': result.tolist()})
submit_file

Unnamed: 0,id,score
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,0.044001
1,da8691b210adb3f67820f5e0c87b337d63112cee52211888,0.040906
2,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,0.041126
3,f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db,0.045363
4,f000950527a6feb617e8d6ca7025dcf9d765429969122069,0.044508
...,...,...
9995,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,0.077206
9996,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,0.577968
9997,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,0.065357
9998,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,0.077457


In [33]:
# Write the submission without the DataFrame index column.
submit_file.to_csv('submit01.csv', index=False)