In [1]:
from __future__ import division

import numpy as np
import pandas as pd 
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import sparse

In [76]:
# Load the raw train/test CSVs. 'Id' is dropped immediately so it is never used as a feature.
df_train = pd.read_csv('./data/gbdt-lr/train.csv').drop(['Id'], axis=1)
df_test = pd.read_csv('./data/gbdt-lr/test.csv').drop(['Id'], axis=1)
# The test file carries no label: tag its rows with the sentinel -1 so that they can be
# separated from the training rows again after the two frames are stacked.
df_test['Label'] = -1

In [77]:
# Stack train on top of test so both go through the same preprocessing.
# sort=True orders the columns alphabetically; the original row labels are kept as-is
# (no ignore_index), so they restart from 0 for the test rows.
data = pd.concat(objs=[df_train, df_test], sort=True)

In [78]:
# Column names of the 26 categorical features (C1..C26) and the 13 numeric features (I1..I13).
sparse_features = ['C%d' % idx for idx in range(1, 27)]
dense_features = ['I%d' % idx for idx in range(1, 14)]

In [79]:
# Missing categorical values become the string '-1' (a category of their own);
# missing numeric values are replaced by 0.
data[sparse_features] = data[sparse_features].fillna(value='-1')
data[dense_features] = data[dense_features].fillna(value=0)

In [80]:
# Show the column labels of the combined frame (alphabetical because of sort=True in the concat).
data.columns

Index(['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3',
       'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'I1', 'I10', 'I11', 'I12', 'I13',
       'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'Label'],
      dtype='object')

In [82]:
# Display the combined frame; rows with Label == -1 are the unlabeled test rows.
data

Unnamed: 0,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,...,I13,I2,I3,I4,I5,I6,I7,I8,I9,Label
0,75ac2fe6,9e5006cd,4d8549da,a48afad2,51b97b8f,b28479f6,d345b1a0,3fa658c5,3486227d,e88ffc9d,...,0.0,0,1.0,0.0,227.0,1.0,173.0,18.0,50.0,1
1,05db9164,e4b08fda,4ab39743,be45b877,ab8a1a53,07d13a8f,06969a20,9bc7fff5,07c540c4,92555263,...,2.0,1,1.0,2.0,27.0,2.0,4.0,2.0,2.0,1
2,05db9164,3013a9ec,371dae82,378b4833,18fc2b1e,cfef1c29,dad721df,1f2e9dec,07c540c4,25c88e42,...,0.0,806,0.0,0.0,1752.0,142.0,2.0,0.0,50.0,1
3,05db9164,0a3a2cb6,a0a5e9d7,fbe49065,ee79db7b,07d13a8f,36721ddc,fc60350c,e5ba7672,5aed7436,...,38.0,-1,42.0,14.0,302.0,38.0,25.0,38.0,90.0,0
4,05db9164,3b08e48b,418037d7,cbb5e0eb,b0bfed6d,07d13a8f,3b2d8705,74d50e5e,e5ba7672,642f2610,...,1.0,57,2.0,1.0,2891.0,2.0,35.0,1.0,137.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,be589b51,3b08e48b,661c2800,9449c78e,38087489,07d13a8f,36721ddc,5fed0876,d4bb7bd8,5aed7436,...,0.0,0,1.0,0.0,149.0,5.0,1.0,0.0,0.0,-1
396,05db9164,3b08e48b,f1b78ab4,0826f297,6e5da64f,1adce6ef,4903dd2e,0abe22ad,2005abd1,5162930e,...,0.0,-1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,-1
397,68fd1e64,6c47047a,606866a9,21a23bfe,e40e52ae,07d13a8f,e3209fc2,587267a3,8efede7f,a78bd508,...,0.0,300,4.0,0.0,4622.0,25.0,20.0,6.0,55.0,-1
398,05db9164,fa7d0797,9163f8f1,eac9feed,b5b29c1f,1adce6ef,7e7dc5e4,98a54621,d4bb7bd8,a1d0cc4f,...,1.0,1,2.0,1.0,5.0,1.0,1.0,1.0,1.0,-1


# 训练方式1：
能否不做 LabelEncoder 转码、直接把类别特征传给 lgb.Dataset？（原始字符串列不能直接使用，仍然需要先编码。）

In [83]:
# Build the LightGBM training set from the categorical columns only.
#   * Rows tagged Label == -1 are unlabeled test rows, not negatives, so they are excluded;
#     a binary objective expects labels in {0, 1}.
#   * The C* columns hold strings. They are cast to pandas 'category' dtype, which LightGBM
#     accepts and treats as categorical features; plain object columns are rejected.
labelled = data[data['Label'] != -1]
lgb_train = lgb.Dataset(labelled[sparse_features].astype('category'), labelled['Label'])

In [None]:

# Settings for the native LightGBM API (approach 1).
# The number of boosting rounds is intentionally NOT part of this dict: 'num_trees' is an
# alias of the iteration count, and when it appears in params it silently overrides the
# num_boost_round argument of lgb.train. Keeping a single source avoids that conflict.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
# 20 rounds is the value that was effectively used before (params['num_trees'] took
# precedence over num_boost_round=100).
# NOTE(review): the only validation set is the training set itself, so the reported
# binary_logloss is a training loss, not a generalisation estimate.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_train])



# 训练方式2：
先用 LabelEncoder 对类别特征转码，再训练模型。

In [7]:
# Replace every categorical column by integer codes, using a fresh encoder per column.
# The encoder is fitted on train and test rows together (both live in `data`).
for feat in sparse_features:
    lbe = LabelEncoder().fit(data[feat])
    data[feat] = lbe.transform(data[feat])

In [8]:
# Rescale each numeric column to the range [0, 1].
# The scaler is fitted on train and test rows together (both live in `data`).
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [9]:
# Split the combined frame back into labelled (train) and unlabeled (test) rows.
# Everything is derived from `data` without in-place edits: popping/dropping a column on a
# boolean-filtered copy is a chained-assignment hazard, and it also makes the cell depend on
# how often it has been run.
is_test = data['Label'] == -1
train = data.loc[~is_test].drop(columns=['Label'])   # features of the labelled rows
target = data.loc[~is_test, 'Label']                 # their labels
test = data.loc[is_test].drop(columns=['Label'])     # features of the rows to predict

In [10]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=2018,
)

In [40]:
# Gradient-boosted trees whose leaf indices are later used as features for the LR model.
# The tree count is given exactly once: 'num_trees' is an alias of n_estimators and, when both
# are passed, the alias silently wins. 21 is the value that was effectively in use (the leaf
# matrix produced from this model has 21 columns), so it is now stated explicitly.
# NOTE(review): a regressor with objective='binary' is kept as-is; only gbm.booster_ is used
# downstream.
gbm = lgb.LGBMRegressor(objective='binary',
                        subsample=0.8,
                        min_child_weight=0.5,
                        colsample_bytree=0.7,
                        num_leaves=64,
                        max_depth=12,
                        learning_rate=0.05,
                        n_estimators=21,
                        )

# NOTE(review): early_stopping_rounds (100) exceeds the number of trees (21), so early stopping
# can never trigger here. Newer LightGBM releases expect this to be configured through
# callbacks=[lgb.early_stopping(...)] -- confirm against the installed version.
gbm.fit(x_train, y_train,
        eval_set = [(x_train, y_train), (x_val, y_val)],
        eval_names = ['train', 'val'],
        eval_metric = 'binary_logloss',
        early_stopping_rounds = 100,
        )
# Underlying Booster, needed for pred_leaf predictions.
model = gbm.booster_

[1]	train's binary_logloss: 0.486578	val's binary_logloss: 0.560972
Training until validation scores don't improve for 100 rounds
[2]	train's binary_logloss: 0.47495	val's binary_logloss: 0.559015
[3]	train's binary_logloss: 0.462347	val's binary_logloss: 0.555928
[4]	train's binary_logloss: 0.45245	val's binary_logloss: 0.551361
[5]	train's binary_logloss: 0.441314	val's binary_logloss: 0.550049
[6]	train's binary_logloss: 0.43139	val's binary_logloss: 0.548752
[7]	train's binary_logloss: 0.422372	val's binary_logloss: 0.547353
[8]	train's binary_logloss: 0.413541	val's binary_logloss: 0.546667
[9]	train's binary_logloss: 0.404651	val's binary_logloss: 0.543713
[10]	train's binary_logloss: 0.396452	val's binary_logloss: 0.543111
[11]	train's binary_logloss: 0.387899	val's binary_logloss: 0.540684
[12]	train's binary_logloss: 0.379921	val's binary_logloss: 0.539786
[13]	train's binary_logloss: 0.372784	val's binary_logloss: 0.539262
[14]	train's binary_logloss: 0.3663	val's binary_logl

In [41]:
# Ask the booster for the leaf index each row falls into, one value per tree (pred_leaf=True).
gbdt_feats_train = model.predict(train, pred_leaf=True)
gbdt_feats_test = model.predict(test, pred_leaf=True)
# One feature name per tree: gbdt_leaf_0, gbdt_leaf_1, ...
gbdt_feats_name = ['gbdt_leaf_{}'.format(tree_idx) for tree_idx in range(gbdt_feats_train.shape[1])]
# Wrap both leaf-index matrices in DataFrames sharing those column names.
df_train_gbdt_feats, df_test_gbdt_feats = (
    pd.DataFrame(leaves, columns=gbdt_feats_name)
    for leaves in (gbdt_feats_train, gbdt_feats_test)
)

In [42]:
# Shape of the test leaf-index frame: (number of test rows, one column per tree).
df_test_gbdt_feats.shape

(400, 21)

含义：上面 DataFrame 中第 i 列的值，是该样本在第 i 棵子树中落入的叶子节点编号。

In [43]:
# Stack the train and test leaf-index frames row-wise (train first) so that both are one-hot
# encoded against the same set of leaf ids.
sparse_data = pd.concat(objs=[df_train_gbdt_feats, df_test_gbdt_feats], axis=0)

In [45]:
# Row/column count of the stacked train+test leaf-index frame.
sparse_data.shape

(1999, 21)

In [46]:
# One-hot encode every leaf-index column in a single pass.
# With columns=..., pd.get_dummies names each indicator '<column>_<leaf id>' and emits them in
# the order of gbdt_feats_name. Doing it in one call avoids re-concatenating an ever-growing
# frame once per tree and avoids mutating sparse_data in place.
sparse_data = pd.get_dummies(sparse_data, columns=gbdt_feats_name)

this is feature: gbdt_leaf_0
this is feature: gbdt_leaf_1
this is feature: gbdt_leaf_2
this is feature: gbdt_leaf_3
this is feature: gbdt_leaf_4
this is feature: gbdt_leaf_5
this is feature: gbdt_leaf_6
this is feature: gbdt_leaf_7
this is feature: gbdt_leaf_8
this is feature: gbdt_leaf_9
this is feature: gbdt_leaf_10
this is feature: gbdt_leaf_11
this is feature: gbdt_leaf_12
this is feature: gbdt_leaf_13
this is feature: gbdt_leaf_14
this is feature: gbdt_leaf_15
this is feature: gbdt_leaf_16
this is feature: gbdt_leaf_17
this is feature: gbdt_leaf_18
this is feature: gbdt_leaf_19
this is feature: gbdt_leaf_20


In [48]:
# Preview the first rows of the one-hot encoded leaf features.
sparse_data.head()

Unnamed: 0,gbdt_leaf_0_0,gbdt_leaf_0_1,gbdt_leaf_0_2,gbdt_leaf_0_3,gbdt_leaf_0_4,gbdt_leaf_0_5,gbdt_leaf_0_6,gbdt_leaf_0_7,gbdt_leaf_0_8,gbdt_leaf_0_9,...,gbdt_leaf_20_38,gbdt_leaf_20_39,gbdt_leaf_20_40,gbdt_leaf_20_41,gbdt_leaf_20_42,gbdt_leaf_20_43,gbdt_leaf_20_44,gbdt_leaf_20_45,gbdt_leaf_20_46,gbdt_leaf_20_47
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Split the one-hot frame back into its train and test parts.
# The boundary comes from the train leaf frame that was stacked first, not from a hard-coded
# test size, so the cell keeps working when the data files change (including an empty test set,
# where a negative slice such as [:-0] would return nothing).
n_train = len(df_train_gbdt_feats)
train = sparse_data.iloc[:n_train]
test = sparse_data.iloc[n_train:]
# Free the combined frame; note that `train`/`test` now hold one-hot leaf features and no
# longer the raw feature frames of the same name.
del sparse_data

In [52]:
# Fit a logistic regression on the one-hot leaf features, report train/validation log-loss,
# then predict the test rows and write a submission file.
x_train, x_val, y_train, y_val = train_test_split(
    train, target, test_size=0.3, random_state=2018
)

print('开始训练lr..')
lr = LogisticRegression().fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
tr_proba = lr.predict_proba(x_train)[:, 1]
tr_logloss = log_loss(y_train, tr_proba)
print('tr-logloss: ', tr_logloss)

val_proba = lr.predict_proba(x_val)[:, 1]
val_logloss = log_loss(y_val, val_proba)
print('val-logloss: ', val_logloss)

print('开始预测...')
y_pred = lr.predict_proba(test)[:, 1]

print('写入结果...')
# 'Id' was dropped from df_test at load time, so it is read again from the CSV.
res = pd.read_csv('data/gbdt-lr/test.csv')
submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
# Both losses are embedded in the output file name.
out_path = './data/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss)
submission.to_csv(out_path, index=False)
print('结束')

开始训练lr..
tr-logloss:  0.03612739053413353
val-logloss:  0.5408154189636616
开始预测...
写入结果...
结束


# 加上dense特征

In [62]:
# Append the scaled numeric features to the one-hot leaf features.
# `data` holds the train rows first and the test rows after them, in the same order as
# `train`/`test`, so the split is positional and sized from `train` instead of a hard-coded
# test size.
n_train = len(train)
dense = data[dense_features]
# Re-label the dense rows with the index of the frame they are joined to: concat(axis=1)
# aligns on index labels, and this makes the row-by-row pairing explicit instead of relying on
# the labels of `data` happening to match.
train_dense = pd.DataFrame(dense.iloc[:n_train].values, columns=dense_features, index=train.index)
test_dense = pd.DataFrame(dense.iloc[n_train:].values, columns=dense_features, index=test.index)

# Dropping any dense columns that are already present keeps the cell idempotent: re-running it
# does not append a second copy of the I* columns.
train = pd.concat([train.drop(columns=dense_features, errors='ignore'), train_dense], axis=1)
test = pd.concat([test.drop(columns=dense_features, errors='ignore'), test_dense], axis=1)

In [65]:
# Same logistic-regression pipeline as before, now on leaf one-hot + dense features:
# split, fit, report train/validation log-loss, predict the test rows, write a submission.
x_train, x_val, y_train, y_val = train_test_split(
    train, target, test_size=0.3, random_state=2018
)

print('开始训练lr..')
lr = LogisticRegression().fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
tr_proba = lr.predict_proba(x_train)[:, 1]
tr_logloss = log_loss(y_train, tr_proba)
print('tr-logloss: ', tr_logloss)

val_proba = lr.predict_proba(x_val)[:, 1]
val_logloss = log_loss(y_val, val_proba)
print('val-logloss: ', val_logloss)

print('开始预测...')
y_pred = lr.predict_proba(test)[:, 1]

print('写入结果...')
# 'Id' was dropped from df_test at load time, so it is read again from the CSV.
res = pd.read_csv('data/gbdt-lr/test.csv')
submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
# Both losses are embedded in the output file name.
out_path = './data/submission_dense_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss)
submission.to_csv(out_path, index=False)
print('结束')

开始训练lr..
tr-logloss:  0.0360974097226158
val-logloss:  0.5405212516762375
开始预测...
写入结果...
结束
