In [27]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb  

In [11]:
train = pd.read_csv('../input/mydata-a/train_featureV5.csv')
test = pd.read_csv('../input/mydata-a/test_featureV5.csv')

In [12]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [13]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 48,
    'num_leaves': 128,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [14]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.28 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 调参

In [20]:
print("调参1：提高准确率")
for num_leaves in range(20,200,5):
    lgb_params['num_leaves'] = num_leaves
    cv_results = lgb.cv(
                            lgb_params,
                            dtrain,
                            nfold=3,
                            early_stopping_rounds=100,
                            num_boost_round=10000,
                            verbose_eval=5,
                            metrics=['binary_error']
                        )
    mean_merror = pd.Series(cv_results['binary_error-mean']).min()
    boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

    if mean_merror < min_merror:
            min_merror = mean_merror
            best_params['num_leaves'] = num_leaves

lgb_params['num_leaves'] = best_params['num_leaves']

调参1：提高准确率
[5]	cv_agg's binary_error: 0.139229 + 0.00301919
[10]	cv_agg's binary_error: 0.14023 + 0.00720934
[15]	cv_agg's binary_error: 0.14083 + 0.00619732
[20]	cv_agg's binary_error: 0.138429 + 0.0073879
[25]	cv_agg's binary_error: 0.135629 + 0.00710249
[30]	cv_agg's binary_error: 0.133228 + 0.00568418
[35]	cv_agg's binary_error: 0.132428 + 0.00655977
[40]	cv_agg's binary_error: 0.129628 + 0.00605621
[45]	cv_agg's binary_error: 0.128227 + 0.00575449
[50]	cv_agg's binary_error: 0.128428 + 0.00791955
[55]	cv_agg's binary_error: 0.129828 + 0.00746968
[60]	cv_agg's binary_error: 0.129028 + 0.0073518
[65]	cv_agg's binary_error: 0.129828 + 0.00577733
[70]	cv_agg's binary_error: 0.128428 + 0.00813064
[75]	cv_agg's binary_error: 0.128228 + 0.00938672
[80]	cv_agg's binary_error: 0.127828 + 0.0099433
[85]	cv_agg's binary_error: 0.128228 + 0.00896519
[90]	cv_agg's binary_error: 0.127428 + 0.0102333
[95]	cv_agg's binary_error: 0.127828 + 0.0100395
[100]	cv_agg's binary_error: 0.129229 + 0.010501

KeyboardInterrupt: 

In [22]:
print("调参2：降低过拟")
for min_data_in_leaf in range(10,200,5):       
    lgb_params['min_data_in_leaf'] = min_data_in_leaf

    
    cv_results = lgb.cv(
                            lgb_params,
                            dtrain,
                            nfold=3,
                            early_stopping_rounds=100,
                            num_boost_round=10000,
                            verbose_eval=5,
                            metrics=['binary_error']
                            )

    
    mean_merror = pd.Series(cv_results['binary_error-mean']).min()
    
    boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

    
    if mean_merror < min_merror:
        
        min_merror = mean_merror
        
        best_params['min_data_in_leaf'] = min_data_in_leaf

lgb_params['min_data_in_leaf'] = best_params['min_data_in_leaf']

调参2：降低过拟
[5]	cv_agg's binary_error: 0.142629 + 0.00232864
[10]	cv_agg's binary_error: 0.139828 + 0.00300343
[15]	cv_agg's binary_error: 0.140828 + 0.00316522
[20]	cv_agg's binary_error: 0.141829 + 0.00302427
[25]	cv_agg's binary_error: 0.14043 + 0.00640837
[30]	cv_agg's binary_error: 0.138429 + 0.00479559
[35]	cv_agg's binary_error: 0.136029 + 0.00527491
[40]	cv_agg's binary_error: 0.138229 + 0.00402882
[45]	cv_agg's binary_error: 0.137229 + 0.00560896
[50]	cv_agg's binary_error: 0.136028 + 0.00448101
[55]	cv_agg's binary_error: 0.134828 + 0.00445047
[60]	cv_agg's binary_error: 0.133828 + 0.00392568
[65]	cv_agg's binary_error: 0.134828 + 0.00302951
[70]	cv_agg's binary_error: 0.134028 + 0.00399771
[75]	cv_agg's binary_error: 0.135629 + 0.00515289
[80]	cv_agg's binary_error: 0.133228 + 0.00605721


KeyboardInterrupt: 

### 本地CV

In [13]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

## 训练

In [14]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

## XGB

In [25]:
params = {
    'booster': 'gbtree',
    # 'objective': 'multi:softmax',  # 多分类的问题、
    # 'objective': 'multi:softprob',   # 多分类概率
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # 'num_class': 9,  # 类别数，与 multisoftmax 并用
    'gamma': 0.1,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 8,  # 构建树的深度，越大越容易过拟合
    'alpha': 0,   # L1正则化系数
    'lambda': 10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,  # 随机采样训练样本
    'colsample_bytree': 0.5,  # 生成树时进行的列采样
    'min_child_weight': 3,
    # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
    # ，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
    # 这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
    'silent': 0,  # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.03,  # 如同学习率
    'seed': 1000,
    'nthread': -1,  # cpu 线程数
    'missing': 1,
#     'scale_pos_weight': (np.sum(y==0)/np.sum(y==1))  # 用来处理正负样本不均衡的问题,通常取：sum(negative cases) / sum(positive cases)
    # 'eval_metric': 'auc'
}

In [28]:
# dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
# dtest = lgb.Dataset(test.drop(['uid'],axis=1))
xgb_train = xgb.DMatrix(train.drop(['uid','label'],axis=1), label=train.label)  
xgb_test = xgb.DMatrix(test.drop(['uid'],axis=1))  

In [31]:
xgb_cv = xgb.cv(params,xgb_train, num_boost_round=200, nfold=4, early_stopping_rounds=200, verbose_eval=True)  

[0]	train-logloss:0.677196+0.000184424	test-logloss:0.678071+0.000213585
[1]	train-logloss:0.661992+0.000698791	test-logloss:0.663944+0.000393933
[2]	train-logloss:0.647152+0.000836595	test-logloss:0.650181+0.000562799
[3]	train-logloss:0.63308+0.00140697	test-logloss:0.636991+0.000646776
[4]	train-logloss:0.620178+0.00146467	test-logloss:0.624771+0.000261667
[5]	train-logloss:0.607683+0.00188149	test-logloss:0.612988+0.000714842
[6]	train-logloss:0.595636+0.0024642	test-logloss:0.601864+0.000669978
[7]	train-logloss:0.584355+0.00271352	test-logloss:0.591513+0.000553767
[8]	train-logloss:0.573418+0.00271895	test-logloss:0.581444+0.00100247
[9]	train-logloss:0.562688+0.003035	test-logloss:0.571832+0.00129969
[10]	train-logloss:0.552752+0.00246128	test-logloss:0.562687+0.00156511
[11]	train-logloss:0.542906+0.00267884	test-logloss:0.553708+0.00175801
[12]	train-logloss:0.533721+0.00284291	test-logloss:0.545276+0.0020028
[13]	train-logloss:0.52421+0.00269459	test-logloss:0.536525+0.002512

KeyboardInterrupt: 

In [34]:
model_xgb = xgb.train(params,xgb_train,num_boost_round=10,evals=(),obj=None,feval=None,maximize=False,early_stopping_rounds=None,
evals_result=None,verbose_eval=True,learning_rates=None,xgb_model=None)


In [None]:
preds_xgb = model_xgb.predict(xgb_test)  

### LGB预测

In [15]:
pred=model.predict(test.drop(['uid'],axis=1))

In [16]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [17]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.28 else 0)
res.label = res.label.map(lambda x: int(x))

In [19]:
res.to_csv('fianal_answer.csv',index=False,header=False,sep=',',columns=['uid','label'])