# 工具包导入&数据读取
## 工具包导入

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.cross_validation import train_test_split
import gc
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline 



In [2]:
%pwd

'/home/admin/Aliyun/Alibaba-3rd-Security-Algorithm-Challenge-master'

## 数据读取
- 为了方便分析，我们读取3000万条数据进行处理

In [3]:
# path = '../final_input/'
# train = pd.read_csv(path + 'final_train.csv',nrows=1000000)
# test = pd.read_csv(path + 'final_test.csv',nrows=1000000)

In [4]:

train = pd.read_csv('../train.csv')
#test = pd.read_csv(path + 'final_test.csv')

# 特征工程 & 验证结果(1-Gram)

In [5]:
train_data = train[['file_id','label']].drop_duplicates()
train_data.head()

Unnamed: 0,file_id,label
0,0,0
424,1,5
426,2,5
460,3,5
3260,4,5


In [6]:
train_data['label'].value_counts()

0    111545
5      3397
2       744
3       598
1       287
4        53
Name: label, dtype: int64

## 全局特征:
- File_id (Api): count,nunique
- File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range
- File_id (Return Value): count,nunique,max,min,quantile(20,40,50,60,80),std,range
- File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range

### File_id (Api): count,nunique

In [7]:
train.head()

Unnamed: 0,file_id,label,api,tid,return_value,index
0,0,0,GetSystemTimeAsFileTime,2644,0,0
1,0,0,NtAllocateVirtualMemory,2644,0,1
2,0,0,NtFreeVirtualMemory,2644,0,2
3,0,0,NtAllocateVirtualMemory,2644,0,3
4,0,0,NtAllocateVirtualMemory,2644,0,4


In [8]:
api_opt = ['count','nunique'] 
for opt in api_opt:
    print(opt)
    tmp = train.groupby(['file_id'])['api'].agg({'fileid_api_' + opt: opt}).reset_index() 
    train_data = pd.merge(train_data,tmp,how='left', on='file_id')  

count


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


nunique


In [9]:
train_data.head()

Unnamed: 0,file_id,label,fileid_api_count,fileid_api_nunique
0,0,0,424,19
1,1,5,2,2
2,2,5,34,15
3,3,5,2800,65
4,4,5,6832,78


### File_id (Tid): count,nunique,max,min,quantile(20,40,50,60,80),std,range

In [10]:
tid_opt = ['count','nunique','max','min','median','std'] 
for opt in tid_opt:
    print(opt)
    tmp = train.groupby(['file_id'])['tid'].agg({'fileid_tid_' + opt: opt}).reset_index() 
    train_data = pd.merge(train_data,tmp,how='left', on='file_id') 

count


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


nunique
max
min
median
std


In [11]:
secs = [0.2,0.4,0.6,0.8]
for sec in secs: 
    train_data['fileid_tid_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['tid'].quantile(sec).values
 
train_data['fileid_tid_range'] = train.groupby(['file_id'])['tid'].quantile(0.975).values - train.groupby(['file_id'])['tid'].quantile(0.0125).values

### File_id (Index): count,nunique,max,min,quantile(20,40,50,60,80),std,range

In [12]:
index_opt = ['count','nunique','max','min','median','std'] 
for opt in index_opt:
    print(opt)
    tmp = train.groupby(['file_id'])['index'].agg({'fileid_index_' + opt: opt}).reset_index() 
    train_data = pd.merge(train_data,tmp,how='left', on='file_id') 

count


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


nunique
max
min
median
std


In [13]:
secs = [0.2,0.4,0.6,0.8]
for sec in secs: 
    train_data['fileid_index_quantile_' + str(sec * 100)] = train.groupby(['file_id'])['index'].quantile(sec).values
 
train_data['fileid_index_range'] = train.groupby(['file_id'])['index'].quantile(0.975).values - train.groupby(['file_id'])['index'].quantile(0.0125).values

### 全局特征的线下验证 <font color=red>( 0.0969482)</font>

#### 评估指标

In [24]:
def lgb_logloss(preds,data):
    labels_ = data.get_label()
    classes_ = np.unique(labels_) 
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)])
    preds_prob_ = np.vstack(preds_prob) 
    
    loss = [] 
    for i in range(preds_prob_.shape[1]):  # 样本个数
        sum_ = 0  
        for j in range(preds_prob_.shape[0]): #类别个数
            pred = preds_prob_[j,i] # 第i个样本预测为第j类的概率
            if  j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred) 
             
        loss.append(sum_)  
         
    return 'loss is: ' ,-1 * (np.sum(loss) / preds_prob_.shape[1]),False

#### 训练特征 & 标签

In [17]:
train_features = [col for col in train_data.columns if col!='label' and col!='file_id']
train_label = 'label'

In [19]:
train_X, test_X, train_Y, test_Y = train_test_split( train_data[train_features],train_data[train_label].values, test_size = 0.33) 
#del _
gc.collect()

train_ind = train_X.index
test_ind = test_X.index

In [26]:
dtrain = lgb.Dataset(train_X,train_Y) 
dval   = lgb.Dataset(test_X,test_Y, reference = dtrain) 

params = {
         'task':'train', 
         'num_leaves': 255,
         'objective': 'multiclass',
         'num_class':6,
        #'min_data_in_leaf': 40,
         'min_data_in_leaf': 1,
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.9,
         'bagging_freq': 5, 
         'max_bin':128,
        'num_threads': 10,
        'random_state':100
     }  
lgb_model_0_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

[1]	training's multi_logloss: 1.64923	training's loss is: : 2.53026	valid_1's multi_logloss: 1.65107	valid_1's loss is: : 2.53251
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_logloss: 1.52682	training's loss is: : 2.37802	valid_1's multi_logloss: 1.53022	valid_1's loss is: : 2.3822
[3]	training's multi_logloss: 1.4199	training's loss is: : 2.24201	valid_1's multi_logloss: 1.42483	valid_1's loss is: : 2.24816
[4]	training's multi_logloss: 1.32484	training's loss is: : 2.11848	valid_1's multi_logloss: 1.33108	valid_1's loss is: : 2.12637
[5]	training's multi_logloss: 1.23958	training's loss is: : 2.00539	valid_1's multi_logloss: 1.24696	valid_1's loss is: : 2.01485
[6]	training's multi_logloss: 1.16259	training's loss is: : 1.90132	valid_1's multi_logloss: 1.17106	valid_1's loss is: : 1.91228
[7]	training's multi_logloss: 1.09247	training's loss is: : 1.80475	valid_1's multi_logloss: 1.10203	valid_1's loss is: : 1.81724
[8]	training's multi_logloss: 

[62]	training's multi_logloss: 0.0852451	training's loss is: : 0.165758	valid_1's multi_logloss: 0.124617	valid_1's loss is: : 0.230288
[63]	training's multi_logloss: 0.0819673	training's loss is: : 0.159547	valid_1's multi_logloss: 0.121804	valid_1's loss is: : 0.224988
[64]	training's multi_logloss: 0.0789316	training's loss is: : 0.153722	valid_1's multi_logloss: 0.119187	valid_1's loss is: : 0.220008
[65]	training's multi_logloss: 0.0760245	training's loss is: : 0.148135	valid_1's multi_logloss: 0.116649	valid_1's loss is: : 0.215183
[66]	training's multi_logloss: 0.0732493	training's loss is: : 0.14279	valid_1's multi_logloss: 0.114253	valid_1's loss is: : 0.210623
[67]	training's multi_logloss: 0.07047	training's loss is: : 0.137531	valid_1's multi_logloss: 0.112049	valid_1's loss is: : 0.206411
[68]	training's multi_logloss: 0.0679192	training's loss is: : 0.132615	valid_1's multi_logloss: 0.109908	valid_1's loss is: : 0.202334
[69]	training's multi_logloss: 0.0654035	training's

[122]	training's multi_logloss: 0.0138144	training's loss is: : 0.0274836	valid_1's multi_logloss: 0.0746849	valid_1's loss is: : 0.136385
[123]	training's multi_logloss: 0.0135065	training's loss is: : 0.0268745	valid_1's multi_logloss: 0.0746788	valid_1's loss is: : 0.1364
[124]	training's multi_logloss: 0.0132088	training's loss is: : 0.0262854	valid_1's multi_logloss: 0.0746122	valid_1's loss is: : 0.136295
[125]	training's multi_logloss: 0.0129286	training's loss is: : 0.0257307	valid_1's multi_logloss: 0.0746309	valid_1's loss is: : 0.136343
[126]	training's multi_logloss: 0.0126519	training's loss is: : 0.0251832	valid_1's multi_logloss: 0.074629	valid_1's loss is: : 0.136374
[127]	training's multi_logloss: 0.0123726	training's loss is: : 0.0246301	valid_1's multi_logloss: 0.0745697	valid_1's loss is: : 0.136293
[128]	training's multi_logloss: 0.0121043	training's loss is: : 0.0240989	valid_1's multi_logloss: 0.0745343	valid_1's loss is: : 0.136258
[129]	training's multi_logloss

In [28]:
def runXGB(train_X,train_y,test_X,test_y=None,feature_names=None,seed_val=0,num_rounds=1000):
    #参数设定
    param = {}
    param['objective'] = 'multi:softprob'#多分类、输出概率值
    param['eta'] = 0.1#学习率
    param['max_depth'] = 6#最大深度，越大越容易过拟合
    param['silent'] = 1#打印提示信息
    param['num_class'] = 6#三个类别
    param['eval_metric']= "mlogloss"#对数损失
    param['min_child_weight']=1#停止条件，这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
    param['subsample'] =0.7#随机采样训练样本
    param['colsample_bytree'] = 0.7# 生成树时进行的列采样
    param['seed'] = seed_val#随机数种子
    num_rounds = num_rounds#迭代次数
    
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X,label=train_y)
    
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X,label=test_y)
        watchlist = [(xgtrain,'train'),(xgtest,'test')]
        model = xgb.train(plst,xgtrain,num_rounds,watchlist,early_stopping_rounds=20)
      #  early_stopping_rounds 当设置的迭代次数较大时，early_stopping_rounds 可在一定的迭代次数内准确率没有提升就停止训练
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst,xgtrain,num_rounds)
    pred_test_y = model.predict(xgtest)
    return pred_test_y,model


In [29]:
from sklearn.cross_validation import StratifiedKFold
import xgboost as xgb
runXGB(train_X,train_Y,test_X,test_Y)

[0]	train-mlogloss:1.52184	test-mlogloss:1.52155
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.32143	test-mlogloss:1.32118
[2]	train-mlogloss:1.16268	test-mlogloss:1.16262
[3]	train-mlogloss:1.03308	test-mlogloss:1.03296
[4]	train-mlogloss:0.924021	test-mlogloss:0.924054
[5]	train-mlogloss:0.830729	test-mlogloss:0.830848
[6]	train-mlogloss:0.750349	test-mlogloss:0.750585
[7]	train-mlogloss:0.680013	test-mlogloss:0.680375
[8]	train-mlogloss:0.61866	test-mlogloss:0.619101
[9]	train-mlogloss:0.564536	test-mlogloss:0.56511
[10]	train-mlogloss:0.516592	test-mlogloss:0.517349
[11]	train-mlogloss:0.473969	test-mlogloss:0.474918
[12]	train-mlogloss:0.435977	test-mlogloss:0.437048
[13]	train-mlogloss:0.401491	test-mlogloss:0.402744
[14]	train-mlogloss:0.371106	test-mlogloss:0.372446
[15]	train-mlogloss:0.34342	test-mlogloss:0.344913
[16]	train-mlogloss:0.318787	test-mlog

[155]	train-mlogloss:0.04065	test-mlogloss:0.074577
[156]	train-mlogloss:0.040467	test-mlogloss:0.074581
[157]	train-mlogloss:0.040264	test-mlogloss:0.074575
[158]	train-mlogloss:0.040033	test-mlogloss:0.074495
[159]	train-mlogloss:0.039858	test-mlogloss:0.074447
[160]	train-mlogloss:0.039686	test-mlogloss:0.074389
[161]	train-mlogloss:0.039512	test-mlogloss:0.07439
[162]	train-mlogloss:0.039348	test-mlogloss:0.074371
[163]	train-mlogloss:0.039156	test-mlogloss:0.074328
[164]	train-mlogloss:0.039004	test-mlogloss:0.074308
[165]	train-mlogloss:0.038798	test-mlogloss:0.074262
[166]	train-mlogloss:0.038581	test-mlogloss:0.074218
[167]	train-mlogloss:0.038408	test-mlogloss:0.074178
[168]	train-mlogloss:0.038176	test-mlogloss:0.074134
[169]	train-mlogloss:0.038014	test-mlogloss:0.074109
[170]	train-mlogloss:0.037909	test-mlogloss:0.074093
[171]	train-mlogloss:0.037743	test-mlogloss:0.074109
[172]	train-mlogloss:0.037574	test-mlogloss:0.074064
[173]	train-mlogloss:0.037392	test-mlogloss:0.07

(array([[9.9572819e-01, 1.6203585e-05, 1.6343502e-03, 8.3550818e-05,
         4.8396148e-05, 2.4893144e-03],
        [9.9548537e-01, 2.3472287e-06, 4.8043585e-05, 4.9336686e-06,
         2.5823115e-06, 4.4566989e-03],
        [9.9947113e-01, 1.4132763e-05, 3.6072724e-05, 1.2409997e-05,
         7.6228835e-06, 4.5866182e-04],
        ...,
        [9.9464637e-01, 5.6236811e-05, 6.2921160e-04, 2.3285649e-04,
         4.9585480e-05, 4.3857256e-03],
        [9.9870121e-01, 3.7009111e-05, 6.2678810e-05, 2.5975469e-05,
         2.1061803e-06, 1.1710105e-03],
        [9.9572951e-01, 2.2936781e-04, 1.5375386e-03, 3.2207929e-04,
         7.7494398e-05, 2.1040195e-03]], dtype=float32),
 <xgboost.core.Booster at 0x7efc5d4e1b00>)

### 全局特征扩充
- File_id + return_value分段：计数

## 局部组合特征(展开形式)
### File_id + Api  
- File_id + Api (tid): count,nunique
- File_id + Api (return value): nunique, max, min, median, std
- File_id + Api (index):  nunique, max, min, median, std



#### File_id + Api (tid): count,nunique

In [30]:
def groupby_pivot_features(data_merge, data_orig , groupby_features,col1 = None, col2 = None, opts = None):
    for opt in opts:
        print(opt)
        train_split = data_orig.groupby(['file_id',col1])[col2].agg({'fileid_' + col1 + '_'+col2+'_'+ str(opt):opt}).reset_index() 
        
        train_split_ =  pd.pivot_table(train_split, values = 'fileid_' + col1 + '_'+col2+'_'+ str(opt), index=['file_id'],columns=[col1])
        new_cols = [ 'fileid_' + col1 + '_'+col2+  '_' + opt + '_' + str(col) for col in train_split_.columns]
        
        groupby_features.append(new_cols)
        train_split_.columns = new_cols 

        train_split_.reset_index(inplace = True)
        
        data_merge = pd.merge(data_merge,train_split_,how='left', on='file_id') 
    return data_merge,groupby_features 
    

In [31]:
groupby_features = []
api_opts = ['count', 'nunique']
train_data_,groupby_features = groupby_pivot_features(train_data, train, groupby_features, col1 = 'api', col2 = 'tid', opts = api_opts)

count


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


nunique


#### File_id + Api (return value): nunique, max, min, median, std

In [32]:
# api_opts = ['nunique','max','min','median','std']
# train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'return_value', opts = api_opts) 

####  File_id + Api(index): nunique, max, min, median, std

In [33]:
api_opts = ['nunique','max','min','median','std']
train_data_,groupby_features = groupby_pivot_features(train_data_, train, groupby_features, col1 = 'api', col2 = 'index', opts = api_opts) 

nunique


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


max
min
median
std


In [34]:
train_data_.head()

Unnamed: 0,file_id,label,fileid_api_count,fileid_api_nunique,fileid_tid_count,fileid_tid_nunique,fileid_tid_max,fileid_tid_min,fileid_tid_median,fileid_tid_std,...,fileid_api_index_std_recv,fileid_api_index_std_recvfrom,fileid_api_index_std_select,fileid_api_index_std_send,fileid_api_index_std_sendto,fileid_api_index_std_setsockopt,fileid_api_index_std_shutdown,fileid_api_index_std_socket,fileid_api_index_std_system,fileid_api_index_std_timeGetTime
0,0,0,424,19,424,1,2644,2644,2644,0.0,...,,,,,,,,,,
1,1,5,2,2,2,1,2524,2524,2524,0.0,...,,,,,,,,,,
2,2,5,34,15,34,1,2516,2516,2516,0.0,...,,,,,,,,,,
3,3,5,2800,65,2800,5,2884,2508,2884,170.76408,...,,,,,,,,24.193663,,
4,4,5,6832,78,6832,6,2968,2060,2820,48.861741,...,,,,,,,,1013.848858,,


In [36]:
len(train_data.columns)

26

### 1阶特征的线下验证(File_id + Api)（<font color=red>0.0347293</font>）

### File_id + Index  
- File_id + Index (api): count,nunique
- File_id + Index (return value): nunique, max, min, median, std(暂时先搁置)
- File_id + Index (tid):  nunique, max, min, median, std(暂时先搁置)


#### File_id +Tid (api): count,nunique

#### File_id + Index特征过拟合，删除


In [37]:
# delcol = []
# for i in range(2):
#     for item in groupby_features2[i]:
#         delcol.append(item)

In [38]:
# train_data_.drop(delcol,axis=1,inplace=True)

## 特征补充（加入index的差值特征）
- File_id + Api (index_diff): 'nunique','max','min','median','std'

In [39]:
train_diff = train.groupby(['file_id','tid'])['index'].diff().fillna(-999).values

In [40]:
train['index_diff'] = train_diff

In [41]:
train_diff = train.loc[train.index_diff!=-999] 

In [42]:
api_opts = ['nunique','max','min','median','std']
train_data_,groupby_features = groupby_pivot_features(train_data_, train_diff, groupby_features, col1 = 'api', col2 = 'index_diff', opts = api_opts) 

nunique


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


max
min
median
std


In [44]:
train_data_.shape

(116624, 3718)

In [54]:
def lgb_logloss(preds,data):
    labels_ = data.get_label()
    classes_ = np.unique(labels_) 
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)])
    preds_prob_ = np.vstack(preds_prob) 
    
    loss = [] 
    for i in range(preds_prob_.shape[1]):  # 样本个数
        sum_ = 0  
        for j in range(preds_prob_.shape[0]): #类别个数
            pred = preds_prob_[j,i] # 第i个样本预测为第j类的概率
            if  j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred) 
             
        loss.append(sum_)  
         
    return 'loss is: ' ,-1 * (np.sum(loss) / preds_prob_.shape[1]),False

### 线下验证(<font color=red>0.0346954</font>)

In [57]:
train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']
train_label = 'label'
print(type(train_features))
#print(len(train_features))
#runXGB(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values,train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values)
# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) 
# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) 

# params = {
#         'task':'train', 
#         'num_leaves': 255,
#         'objective': 'multiclass',
#         'num_class':6,
#         'min_data_in_leaf': 40,
#         'learning_rate': 0.05,
#         'feature_fraction': 0.85,
#         'bagging_fraction': 0.9,
#         'bagging_freq': 5, 
#         'max_bin':128,
#         'num_threads': 64,
#         'random_state':100
#     }  
# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

<class 'list'>


### 删除quantile,std统计变量之后的验证(<font color=red>0.0350054</font>)  

In [30]:
# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]
# train_label = 'label'
# print(len(train_features))
# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) 
# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) 

# params = {
#         'task':'train', 
#         'num_leaves': 255,
#         'objective': 'multiclass',
#         'num_class':6,
#         'min_data_in_leaf': 40,
#         'learning_rate': 0.05,
#         'feature_fraction': 0.85,
#         'bagging_fraction': 0.9,
#         'bagging_freq': 5, 
#         'max_bin':128,
#         'num_threads': 64,
#         'random_state':100
#     }  
# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

In [31]:
# train_data_.to_csv('/data/Data_JieZhang/TC_SAFE/train_val/train_data.csv',index = None) 

# 特征工程& 验证结果 2-Gram
## 全局特征
### File_id（Api_2）:count,nunique

In [None]:
train['api_shift'] = train['api'].shift(-1)
train['api_2'] = train['api'] +'_' + train['api_shift']

In [None]:
train.drop(['api_shift'],axis=1,inplace=True)

In [None]:
api_count = train['api_2'].value_counts()

In [None]:
api_opt = ['count','nunique'] 
for opt in api_opt:
    print(opt)
    tmp = train.groupby(['file_id'])['api_2'].agg({'fileid_api_2_' + opt: opt}).reset_index() 
    train_data_ = pd.merge(train_data_,tmp,how='left', on='file_id')  

## 局部特征
### File_id + tid (Api_2): count特征

In [36]:
api_value_counts = pd.DataFrame(api_count).reset_index()
api_value_counts.columns = ['api_2','api_2_count']

In [37]:
train = pd.merge(train, api_value_counts, on ='api_2' , how='left')

In [38]:
api_opts = ['count']
groupby_features =  []
train_data_,groupby_features = groupby_pivot_features(train_data_, train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'tid', opts = api_opts)

count


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


### 线下验证(<font color=red> 0.0330886</font>)

In [39]:
# train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']
# train_label = 'label'
# print(len(train_features))
# dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) 
# dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) 

# params = {
#         'task':'train', 
#         'num_leaves': 255,
#         'objective': 'multiclass',
#         'num_class':6,
#         'min_data_in_leaf': 40,
#         'learning_rate': 0.05,
#         'feature_fraction': 0.85,
#         'bagging_fraction': 0.9,
#         'bagging_freq': 5, 
#         'max_bin':128,
#         'num_threads': 64,
#         'random_state':100
#     }  
# lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

### File_id + index (Api_2): max,min特征

In [40]:
train_features = [col for col in train_data_.columns if col!='label' and col!='file_id']
train_label = 'label'
print(len(train_features))
train_X, test_X, train_Y, test_Y = train_test_split( train_data_[train_features],train_data_[train_label].values, test_size = 0.33) 

dtrain = lgb.Dataset(train_X,train_Y) 
dval   = lgb.Dataset(test_X,test_Y, reference = dtrain) 

params = {
        'task':'train', 
        'num_leaves': 255,
        'objective': 'multiclass',
        'num_class':8,
        #'min_data_in_leaf': 40,
        'min_data_in_leaf': 10,
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.9,
        'bagging_freq': 5, 
        'max_bin':128,
        'num_threads': 64,
        'random_state':100
    }  
lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

12236
[1]	training's multi_logloss: 1.90825	training's loss is: : 2.81617	valid_1's multi_logloss: 1.92056	valid_1's loss is: : 2.83041
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_logloss: 1.7664	training's loss is: : 2.64826	valid_1's multi_logloss: 1.78989	valid_1's loss is: : 2.67577
[3]	training's multi_logloss: 1.64441	training's loss is: : 2.50066	valid_1's multi_logloss: 1.67822	valid_1's loss is: : 2.54072
[4]	training's multi_logloss: 1.53808	training's loss is: : 2.36933	valid_1's multi_logloss: 1.58129	valid_1's loss is: : 2.42106
[5]	training's multi_logloss: 1.44341	training's loss is: : 2.25009	valid_1's multi_logloss: 1.49486	valid_1's loss is: : 2.31237
[6]	training's multi_logloss: 1.35794	training's loss is: : 2.14047	valid_1's multi_logloss: 1.41736	valid_1's loss is: : 2.21313
[7]	training's multi_logloss: 1.28042	training's loss is: : 2.03927	valid_1's multi_logloss: 1.3472	valid_1's loss is: : 2.12168
[8]	training's multi_log

[63]	training's multi_logloss: 0.109875	training's loss is: : 0.212393	valid_1's multi_logloss: 0.354111	valid_1's loss is: : 0.599118
[64]	training's multi_logloss: 0.105601	training's loss is: : 0.204351	valid_1's multi_logloss: 0.351429	valid_1's loss is: : 0.594374
[65]	training's multi_logloss: 0.101489	training's loss is: : 0.196593	valid_1's multi_logloss: 0.348967	valid_1's loss is: : 0.590063
[66]	training's multi_logloss: 0.0975043	training's loss is: : 0.189071	valid_1's multi_logloss: 0.346486	valid_1's loss is: : 0.585778
[67]	training's multi_logloss: 0.0936928	training's loss is: : 0.181861	valid_1's multi_logloss: 0.344046	valid_1's loss is: : 0.581524
[68]	training's multi_logloss: 0.0900635	training's loss is: : 0.17498	valid_1's multi_logloss: 0.341734	valid_1's loss is: : 0.577578
[69]	training's multi_logloss: 0.086594	training's loss is: : 0.16839	valid_1's multi_logloss: 0.33952	valid_1's loss is: : 0.573784
[70]	training's multi_logloss: 0.0833025	training's los

[123]	training's multi_logloss: 0.0108364	training's loss is: : 0.0215714	valid_1's multi_logloss: 0.326212	valid_1's loss is: : 0.561192
[124]	training's multi_logloss: 0.0104299	training's loss is: : 0.0207655	valid_1's multi_logloss: 0.326855	valid_1's loss is: : 0.56255
[125]	training's multi_logloss: 0.0100383	training's loss is: : 0.0199888	valid_1's multi_logloss: 0.327589	valid_1's loss is: : 0.564102
[126]	training's multi_logloss: 0.00966387	training's loss is: : 0.0192463	valid_1's multi_logloss: 0.328251	valid_1's loss is: : 0.565496
[127]	training's multi_logloss: 0.00930306	training's loss is: : 0.0185304	valid_1's multi_logloss: 0.329061	valid_1's loss is: : 0.567172
[128]	training's multi_logloss: 0.00895435	training's loss is: : 0.0178384	valid_1's multi_logloss: 0.329773	valid_1's loss is: : 0.568687
[129]	training's multi_logloss: 0.00862021	training's loss is: : 0.0171752	valid_1's multi_logloss: 0.330612	valid_1's loss is: : 0.570442
[130]	training's multi_logloss:

In [41]:
fea_imp = pd.DataFrame({'feature':train_features, 'imp':lgb_model_3_order.feature_importance()}).sort_values('imp')
important_features = fea_imp.loc[fea_imp.imp >=1, 'feature'].values
important_features = list(important_features)

important_features.append('file_id')
important_features.append('label')

In [42]:
# train_data_[important_features].to_csv('../feature_final/train_data_2gram.csv',index = None)

train_ind = train_X.index
test_ind = test_X.index

In [43]:
# len(important_features)

In [44]:
api_opts = ['max','min']
train_data_,groupby_features = groupby_pivot_features(train_data_[important_features], train.loc[train.api_2_count>=20], groupby_features, col1 = 'api_2', col2 = 'index', opts = api_opts)

max


is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


min


In [45]:
train_features = [col for col in train_data_.columns if col!='label' and col!='file_id' and 'std' not in col and 'quantile' not in col]
train_label = 'label'

train_ind = train_X.index
test_ind = test_X.index

dtrain = lgb.Dataset(train_data_.loc[train_ind,train_features],train_data_.loc[train_ind,train_label].values) 
dval   = lgb.Dataset(train_data_.loc[test_ind,train_features],train_data_.loc[test_ind,train_label].values, reference = dtrain) 

params = {
        'task':'train', 
        'num_leaves': 255,
        'objective': 'multiclass',
        'num_class':8,
        'min_data_in_leaf': 10,
        #'min_data_in_leaf': 1,
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.9,
        'bagging_freq': 5, 
        'max_bin':128,
        'num_threads': 64,
        'random_state':100
    }  
lgb_model_3_order = lgb.train(params, dtrain, num_boost_round=500,valid_sets=[dtrain,dval], early_stopping_rounds=50, feval=lgb_logloss)  

[1]	training's multi_logloss: 1.90856	training's loss is: : 2.81654	valid_1's multi_logloss: 1.92087	valid_1's loss is: : 2.83077
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_logloss: 1.76571	training's loss is: : 2.64745	valid_1's multi_logloss: 1.78921	valid_1's loss is: : 2.67497
[3]	training's multi_logloss: 1.64335	training's loss is: : 2.49943	valid_1's multi_logloss: 1.67637	valid_1's loss is: : 2.53858
[4]	training's multi_logloss: 1.53597	training's loss is: : 2.3668	valid_1's multi_logloss: 1.5782	valid_1's loss is: : 2.41745
[5]	training's multi_logloss: 1.44075	training's loss is: : 2.24685	valid_1's multi_logloss: 1.49107	valid_1's loss is: : 2.30783
[6]	training's multi_logloss: 1.3547	training's loss is: : 2.13643	valid_1's multi_logloss: 1.41293	valid_1's loss is: : 2.20775
[7]	training's multi_logloss: 1.27698	training's loss is: : 2.03491	valid_1's multi_logloss: 1.34204	valid_1's loss is: : 2.11535
[8]	training's multi_logloss: 1

[63]	training's multi_logloss: 0.109227	training's loss is: : 0.21106	valid_1's multi_logloss: 0.348992	valid_1's loss is: : 0.590537
[64]	training's multi_logloss: 0.104992	training's loss is: : 0.203088	valid_1's multi_logloss: 0.346622	valid_1's loss is: : 0.586319
[65]	training's multi_logloss: 0.100929	training's loss is: : 0.195426	valid_1's multi_logloss: 0.344024	valid_1's loss is: : 0.581803
[66]	training's multi_logloss: 0.0969796	training's loss is: : 0.18797	valid_1's multi_logloss: 0.341403	valid_1's loss is: : 0.577185
[67]	training's multi_logloss: 0.0931925	training's loss is: : 0.180804	valid_1's multi_logloss: 0.338987	valid_1's loss is: : 0.572995
[68]	training's multi_logloss: 0.0896026	training's loss is: : 0.173997	valid_1's multi_logloss: 0.336951	valid_1's loss is: : 0.569401
[69]	training's multi_logloss: 0.0861973	training's loss is: : 0.167526	valid_1's multi_logloss: 0.334829	valid_1's loss is: : 0.56571
[70]	training's multi_logloss: 0.0829002	training's lo

[123]	training's multi_logloss: 0.0108377	training's loss is: : 0.0215677	valid_1's multi_logloss: 0.323428	valid_1's loss is: : 0.55709
[124]	training's multi_logloss: 0.0104344	training's loss is: : 0.0207686	valid_1's multi_logloss: 0.324015	valid_1's loss is: : 0.558444
[125]	training's multi_logloss: 0.0100475	training's loss is: : 0.0200016	valid_1's multi_logloss: 0.324785	valid_1's loss is: : 0.560056
[126]	training's multi_logloss: 0.00967195	training's loss is: : 0.019257	valid_1's multi_logloss: 0.325642	valid_1's loss is: : 0.561782
[127]	training's multi_logloss: 0.00930444	training's loss is: : 0.0185283	valid_1's multi_logloss: 0.326291	valid_1's loss is: : 0.563197
[128]	training's multi_logloss: 0.00895312	training's loss is: : 0.0178314	valid_1's multi_logloss: 0.326888	valid_1's loss is: : 0.564504
[129]	training's multi_logloss: 0.0086164	training's loss is: : 0.0171633	valid_1's multi_logloss: 0.327696	valid_1's loss is: : 0.566191
[130]	training's multi_logloss: 0

In [46]:
fea_imp = pd.DataFrame({'feature':train_features, 'imp':lgb_model_3_order.feature_importance()}).sort_values('imp')
important_features = fea_imp.loc[fea_imp.imp >=1, 'feature'].values
important_features = list(important_features)

important_features.append('file_id')
important_features.append('label')

train_data_[important_features].to_csv('./train_data_2gram.csv',index = None)
 

# 附录
tf-idf的1Gram特征可以替换api的次数特征等，加入tf-idf有提升，提升较小