In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [113]:
train = pd.read_csv('../data/baseline_train_featureV1.csv')
test = pd.read_csv('../data/baseline_test_featureV1.csv')

In [114]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [115]:
uid_test_b = pd.read_csv('../data/uid_test_b.txt')

In [116]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [117]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [118]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.744885 + 0.012333
[10]	cv_agg's res: 0.759769 + 0.0067687
[15]	cv_agg's res: 0.766704 + 0.00747624
[20]	cv_agg's res: 0.765361 + 0.00853749
[25]	cv_agg's res: 0.769359 + 0.00709288
[30]	cv_agg's res: 0.771683 + 0.00467434
[35]	cv_agg's res: 0.773135 + 0.00654911
[40]	cv_agg's res: 0.776656 + 0.00423966
[45]	cv_agg's res: 0.778561 + 0.0045154
[50]	cv_agg's res: 0.781046 + 0.00953071
[55]	cv_agg's res: 0.781835 + 0.00738672
[60]	cv_agg's res: 0.780516 + 0.00795074
[65]	cv_agg's res: 0.782884 + 0.00807411
[70]	cv_agg's res: 0.781147 + 0.00859915
[75]	cv_agg's res: 0.780524 + 0.00947809
[80]	cv_agg's res: 0.781529 + 0.00810055
[85]	cv_agg's res: 0.783109 + 0.00795781
[90]	cv_agg's res: 0.782494 + 0.00871028
[95]	cv_agg's res: 0.783634 + 0.0122432
[100]	cv_agg's res: 0.782396 + 0.0124409
[105]	cv_agg's res: 0.782033 + 0.0114194
[110]	cv_agg's res: 0.781797 + 0.0109774
[115]	cv_agg's res: 0.780908 + 0.00806153
[120]	cv_agg's res: 0.781195 + 0.00939188
[125]	cv_agg's res: 

{'res-mean': [0.6742108586301144,
  0.700658708839477,
  0.7384679980472889,
  0.7329797422962391,
  0.7448845511724874,
  0.7445184631328122,
  0.7559760649156368,
  0.7599102698380867,
  0.7614301966532766,
  0.7597686994539211,
  0.7602087166430712,
  0.7640452018587104,
  0.7634999210587378,
  0.7649816078447286,
  0.7667035746110799,
  0.7691029214136504,
  0.769836046639452,
  0.7658998657006482,
  0.7657510396176505,
  0.76536078214668,
  0.7655030393865353,
  0.7658191102750095,
  0.7697585100128178,
  0.7688008585933379,
  0.7693589662342312,
  0.7697896650082541,
  0.7668672334825849,
  0.7683482678543093,
  0.7705991425428594,
  0.7716832149356323,
  0.7719610391768884,
  0.7677380304585016,
  0.7709726532267119,
  0.7721884676827081,
  0.7731354185395397,
  0.7743260207646466,
  0.7745480745101481,
  0.7759498424779406,
  0.7757026888819979,
  0.7766558009713028,
  0.7772583748517862,
  0.777509759134801,
  0.7783554879298796,
  0.7775503267082601,
  0.7785611461882255,
  0

## 训练

In [8]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.889311
[10]	training's res: 0.911326
[15]	training's res: 0.926073
[20]	training's res: 0.939491
[25]	training's res: 0.952701
[30]	training's res: 0.963059
[35]	training's res: 0.974133
[40]	training's res: 0.981368
[45]	training's res: 0.986526
[50]	training's res: 0.991306
[55]	training's res: 0.995715
[60]	training's res: 0.997539
[65]	training's res: 0.998886
[70]	training's res: 0.999778
[75]	training's res: 1
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[195]	trai

### 预测

In [9]:
pred=model.predict(test.drop(['uid'],axis=1))

In [10]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [11]:
res_b = pd.merge(uid_test_b, res, how='left',on='uid')
res_b=res_b.sort_values(by='label',ascending=False).reset_index()

In [12]:
a = []
for i in range(3000):
    a.append(0)
for i in range(int(3000*900/4999)):
    a[i] = 1
res_label = pd.DataFrame(a)

In [13]:
res_b['label'] = res_label

In [14]:
res_b = res_b.drop('index', axis=1)

In [15]:
res_b.to_csv('lgb-baseline-fusai-0602.csv',index=False,header=False,sep=',',columns=['uid','label'])