In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('train_feature_b_V1.csv')
test = pd.read_csv('test_feature_b_V1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),
                     label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [10]:
lgb_params =  {
    'max_bin':10,
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 16,
    'num_leaves': 256,
    'learning_rate': 0.04,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
    #'lambda_l1': 0.001,
    'skip_drop': 0.95,
    # max_drop 改小后，结果变好
    'max_drop' : 8,
    'lambda_l2': 0.005

    #'num_threads': 18
}

LightGBM 使用 leaf-wise 的树生长策略, 而很多其他流行的算法采用 depth-wise 的树生长策略.   
与 depth-wise 的树生长策略相较, leaf-wise 算法可以收敛的更快.    
但是, 如果参数选择不当的话, leaf-wise 算法有可能导致过拟合.  

http://lightgbm.apachecn.org/cn/latest/Parameters-Tuning.html#leaf-wise


num_leaves. 这是控制树模型复杂度的主要参数. 理论上, 借鉴 depth-wise 树, 我们可以设置 num_leaves = 2^(max_depth) 但是, 这种简单的转化在实际应用中表现不佳. 这是因为, 当叶子数目相同时, leaf-wise 树要比 depth-wise 树深得多, 这就有可能导致过拟合. 因此, 当我们试着调整 num_leaves 的取值时, 应该让其小于 2^(max_depth). 举个例子, 当 max_depth=6 时(这里译者认为例子中, 树的最大深度应为7), depth-wise 树可以达到较高的准确率.但是如果设置 num_leaves 为 127 时, 有可能会导致过拟合, 而将其设置为 70 或 80 时可能会得到比 depth-wise 树更高的准确率. 其实, depth 的概念在 leaf-wise 树中并没有多大作用, 因为并不存在一个从 leaves 到 depth 的合理映射.

min_data_in_leaf. 这是处理 leaf-wise 树的过拟合问题中一个非常重要的参数. 它的值取决于训练数据的样本个树和 num_leaves. 将其设置的较大可以避免生成一个过深的树, 但有可能导致欠拟合. 实际应用中, 对于大数据集, 设置其为几百或几千就足够了.



## 本地 CV

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True

In [13]:
lgb.cv(lgb_params,
       dtrain,
       feval=evalMetric,
       early_stopping_rounds=100,
       verbose_eval=5,
       num_boost_round=500,
       nfold=3,
       metrics=['evalMetric'])


[5]	cv_agg's res: 0.741994 + 0.00997265
[10]	cv_agg's res: 0.74867 + 0.00935533
[15]	cv_agg's res: 0.752296 + 0.0088537
[20]	cv_agg's res: 0.750784 + 0.00697227
[25]	cv_agg's res: 0.757434 + 0.00733749
[30]	cv_agg's res: 0.758333 + 0.0098986
[35]	cv_agg's res: 0.757776 + 0.00602476
[40]	cv_agg's res: 0.758035 + 0.00555178
[45]	cv_agg's res: 0.760469 + 0.00693763
[50]	cv_agg's res: 0.763328 + 0.00573268
[55]	cv_agg's res: 0.767601 + 0.00840056
[60]	cv_agg's res: 0.768559 + 0.00926277
[65]	cv_agg's res: 0.769856 + 0.00996407
[70]	cv_agg's res: 0.770707 + 0.0111501
[75]	cv_agg's res: 0.771366 + 0.0110921
[80]	cv_agg's res: 0.772848 + 0.0104222
[85]	cv_agg's res: 0.774597 + 0.00931024
[90]	cv_agg's res: 0.774319 + 0.00873744
[95]	cv_agg's res: 0.773952 + 0.00859013
[100]	cv_agg's res: 0.773842 + 0.00969116
[105]	cv_agg's res: 0.774777 + 0.0112883
[110]	cv_agg's res: 0.772067 + 0.0103625
[115]	cv_agg's res: 0.772607 + 0.00959932
[120]	cv_agg's res: 0.775898 + 0.0107378
[125]	cv_agg's res: 0

{'res-mean': [0.7347819956501654,
  0.7408059057564426,
  0.7376535919002555,
  0.7394114305579377,
  0.7419936580784926,
  0.7456802642453226,
  0.7489591076341009,
  0.7497662661528685,
  0.7500842654260126,
  0.7486703340004114,
  0.7487834771786627,
  0.7512824652281008,
  0.7505891671627946,
  0.7518017910724932,
  0.7522956707446582,
  0.7496138493500505,
  0.7502712481130546,
  0.7524813555300286,
  0.7543234725979312,
  0.7507838445683813,
  0.7542791856136747,
  0.7556441976019806,
  0.7570891159753167,
  0.7564432365886273,
  0.7574343760623347,
  0.7582751161570149,
  0.7566461599112936,
  0.7592739045646212,
  0.7599929376453445,
  0.7583327346507254,
  0.7597162764668107,
  0.7608125983806039,
  0.7593148578059296,
  0.7616029000454446,
  0.7577760424699514,
  0.7595541476187652,
  0.7598927206085492,
  0.7596064302557308,
  0.7601347118990126,
  0.7580350902465126,
  0.7609240447027101,
  0.7607401257738382,
  0.7619807893296121,
  0.760848660213253,
  0.7604689393042356,

## 训练

In [12]:
model =lgb.train(lgb_params,
                 dtrain,
                 feval=evalMetric,
                 verbose_eval=5,
                 num_boost_round = 500,
                 valid_sets=[dtrain])

[5]	training's res: 0.875228
[10]	training's res: 0.89033
[15]	training's res: 0.902898
[20]	training's res: 0.911706
[25]	training's res: 0.919474
[30]	training's res: 0.926706
[35]	training's res: 0.935386
[40]	training's res: 0.940653
[45]	training's res: 0.947782
[50]	training's res: 0.953037
[55]	training's res: 0.960141
[60]	training's res: 0.966362
[65]	training's res: 0.972117
[70]	training's res: 0.976593
[75]	training's res: 0.979175
[80]	training's res: 0.984851
[85]	training's res: 0.987587
[90]	training's res: 0.990726
[95]	training's res: 0.991969
[100]	training's res: 0.9952
[105]	training's res: 0.996369
[110]	training's res: 0.997288
[115]	training's res: 0.9982
[120]	training's res: 0.998655
[125]	training's res: 0.99866
[130]	training's res: 0.999108
[135]	training's res: 0.999332
[140]	training's res: 0.999333
[145]	training's res: 0.999333
[150]	training's res: 0.999555
[155]	training's res: 0.999555
[160]	training's res: 1
[165]	training's res: 1
[170]	training's 

## 预测

In [None]:
pred=model.predict(test.drop(['uid'],axis=1))

In [None]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [None]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [None]:
res.to_csv('b-v2-last.csv',index=False,header=False,
           sep=',',columns=['uid','label'])
