导入所需库

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
warnings.filterwarnings('ignore')

减少内存中使用数据的方法

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    
    return df

读取数据集并优化内存

In [3]:
data_train = pd.read_csv('clean_train.csv')
data_train = reduce_mem_usage(data_train)
data_test_a = pd.read_csv('clean_test.csv')
data_test_a = reduce_mem_usage(data_test_a)

Mem. usage decreased to 77.39 Mb (73.2% reduction)
Mem. usage decreased to 19.46 Mb (72.9% reduction)


去除不需要的特征

In [4]:
features = [f for f in data_train.columns if f not in ['id','issueDate','isDefault']]
train_x = data_train[features]
test_x = data_test_a[features]
train_y = data_train['isDefault']

贝叶斯优化进行调参

将data_train划分为训练集和验证集

In [5]:
bayes_trn_index, bayes_val_index = list(StratifiedKFold(n_splits=2, shuffle=True, random_state=1).split(train_x, train_y))[0]

定义目标函数

In [6]:
def LGB_bayesian(num_leaves,
                 max_depth,
                 max_bin,
                 bagging_fraction,
                 bagging_freq,
                 feature_fraction,
                 min_data_in_leaf,
                 min_child_weight,
                 min_split_gain,
                 min_child_samples,
                 lambda_l2):
    
    trn_x, trn_y, val_x, val_y = train_x.iloc[bayes_trn_index], train_y[bayes_trn_index],train_x.iloc[bayes_val_index], train_y[bayes_val_index]
    lgb_train = lgb.Dataset(trn_x, label=trn_y)
    lgb_valid = lgb.Dataset(val_x, label=val_y)
    
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc', 
        'num_leaves': int(num_leaves), 
        'max_depth': int(max_depth), 
        'max_bin': int(max_bin), 
        'bagging_fraction': round(bagging_fraction, 2), 
        'bagging_freq': int(bagging_freq), 
        'feature_fraction': round(feature_fraction, 2),
        'min_data_in_leaf': int(min_data_in_leaf),
        'min_split_gain': min_split_gain, 
        'min_child_samples': int(min_child_samples), 
        'min_child_weight': min_child_weight, 
        'lambda_l2': lambda_l2, 
        'n_jobs': 8,
        'learning_rate': 0.01,
        'verbosity': -1, 
        }
    
    num_round = 10000
    model = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_valid], verbose_eval=200, early_stopping_rounds=200)
    pred = model.predict(val_x, num_iteration=model.best_iteration)
    score = roc_auc_score(val_y, pred)
    
    return score

定义参数范围

In [7]:
bounds_LGB = {
    'num_leaves': (30, 150), 
    'max_depth': (3, 20), 
    'max_bin': (30, 80), 
    'bagging_fraction': (0.5, 1.0), 
    'bagging_freq': (1, 50), 
    'feature_fraction': (0.5, 1.0), 
    'min_data_in_leaf':(30,150),
    'min_split_gain': (0.0, 1.0), 
    'min_child_samples': (25, 125), 
    'min_child_weight': (0.0, 10), 
    'lambda_l2': (0.0,10.0)
}

使用贝叶斯优化

In [8]:
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=13)
init_points = 5
n_iter = 10
LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

|   iter    |  target   | baggin... | baggin... | featur... | lambda_l2 |  max_bin  | max_depth | min_ch... | min_ch... | min_da... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.719887
[400]	valid_0's auc: 0.726103
[600]	valid_0's auc: 0.729308
[800]	valid_0's auc: 0.731264
[1000]	valid_0's auc: 0.732413
[1200]	valid_0's auc: 0.732996
[1400]	valid_0's auc: 0.733553
[1600]	valid_0's auc: 0.733802
[1800]	valid_0's auc: 0.73408
[2000]	valid_0's auc: 0.734409
[2200]	valid_0's auc: 0.734579
[2400]	valid_0's auc: 0.734745
[2600]	valid_0's auc: 0.73485
[2800]	valid_0's auc: 0.734983
[3000]	valid_0's auc: 0.735013
[3200]	valid_0's auc: 0.735109
[3400]	valid_0's auc: 0.735173
[3600]	valid_0's auc: 0.73527
[3800]	valid_0's auc: 0.73535
[4000]	valid_0's auc: 0.735363
[4200]	val

[2800]	valid_0's auc: 0.734851
Early stopping, best iteration is:
[2705]	valid_0's auc: 0.734893
| [0m 8       [0m | [0m 0.7349  [0m | [0m 0.6992  [0m | [0m 15.03   [0m | [0m 0.9706  [0m | [0m 4.516   [0m | [0m 61.46   [0m | [0m 14.44   [0m | [0m 85.05   [0m | [0m 7.187   [0m | [0m 48.73   [0m | [0m 0.06048 [0m | [0m 56.38   [0m |
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.724967
[400]	valid_0's auc: 0.729208
[600]	valid_0's auc: 0.731658
[800]	valid_0's auc: 0.733148
[1000]	valid_0's auc: 0.733965
[1200]	valid_0's auc: 0.734433
[1400]	valid_0's auc: 0.734693
[1600]	valid_0's auc: 0.734874
[1800]	valid_0's auc: 0.734957
[2000]	valid_0's auc: 0.735027
[2200]	valid_0's auc: 0.735062
[2400]	valid_0's auc: 0.735131
[2600]	valid_0's auc: 0.735066
Early stopping, best iteration is:
[2405]	valid_0's auc: 0.735137
| [0m 9       [0m | [0m 0.7351  [0m | [0m 0.7482  [0m | [0m 13.33   [0m | [0m 0.5887  [0m | [0m 1.375  

获取最佳参数

In [9]:
LGB_BO.max['params']

{'bagging_fraction': 0.7228117783307912,
 'bagging_freq': 12.846410699432155,
 'feature_fraction': 0.7031745228966835,
 'lambda_l2': 7.902405154563362,
 'max_bin': 74.07314172395758,
 'max_depth': 7.744560295583925,
 'min_child_samples': 62.04171900452664,
 'min_child_weight': 8.146764474600342,
 'min_data_in_leaf': 81.86903052339719,
 'min_split_gain': 0.14382617412439602,
 'num_leaves': 105.49873874462759}

采用lightgbm建模，k折交叉验证来评估模型

In [10]:
folds = 10
seed = 2020
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

train = np.zeros(train_x.shape[0])
test = np.zeros(test_x.shape[0])

for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
    print('************************************ {} ************************************'.format(str(i+1)))
    trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
    
    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)
    
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': int(LGB_BO.max['params']['num_leaves']), 
        'max_depth': int(LGB_BO.max['params']['max_depth']), 
        'max_bin': int(LGB_BO.max['params']['max_bin']), 
        'bagging_fraction': LGB_BO.max['params']['bagging_fraction'],
        'bagging_freq': int(LGB_BO.max['params']['bagging_freq']), 
        'feature_fraction': LGB_BO.max['params']['feature_fraction'],
        'min_data_in_leaf': int(LGB_BO.max['params']['min_data_in_leaf']),
        'min_split_gain': LGB_BO.max['params']['min_split_gain'], 
        'min_child_samples': int(LGB_BO.max['params']['min_child_samples']), 
        'min_child_weight': LGB_BO.max['params']['min_child_weight'], 
        'lambda_l2': LGB_BO.max['params']['lambda_l2'], 
        'learning_rate': 0.01,
        'seed': 2020,
        'nthread': 28,
        'n_jobs':24,
        'verbose': -1,
        'silent': True
    }
    
    num_round = 10000
    model = lgb.train(params, train_matrix, num_round, valid_sets=[valid_matrix], verbose_eval=200,early_stopping_rounds=200)
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    
    train[valid_index] = val_pred
    test = test_pred / kf.n_splits
    score = roc_auc_score(val_y, val_pred)

    print(score)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.722027
[400]	valid_0's auc: 0.726903
[600]	valid_0's auc: 0.729765
[800]	valid_0's auc: 0.73159
[1000]	valid_0's auc: 0.733012
[1200]	valid_0's auc: 0.733979
[1400]	valid_0's auc: 0.734575
[1600]	valid_0's auc: 0.735219
[1800]	valid_0's auc: 0.735638
[2000]	valid_0's auc: 0.735995
[2200]	valid_0's auc: 0.736304
[2400]	valid_0's auc: 0.736531
[2600]	valid_0's auc: 0.736791
[2800]	valid_0's auc: 0.737042
[3000]	valid_0's auc: 0.737217
[3200]	valid_0's auc: 0.737327
[3400]	valid_0's auc: 0.737435
[3600]	valid_0's auc: 0.737585
[3800]	valid_0's auc: 0.737679
[4000]	valid_0's auc: 0.737717
[4200]	valid_0's auc: 0.737695
[4400]	valid_0's auc: 0.737752
[4600]	valid_0's auc: 0.73775
[4800]	valid_0's auc: 0.737839
[5000]	valid_0's auc: 0.737893
[5200]	valid_0's auc: 0.737903
[5400]	valid_0's auc: 0.737964
[5600]	valid_0's auc: 0.737993


0.7377973761699876
************************************ 10 ************************************
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.722192
[400]	valid_0's auc: 0.726759
[600]	valid_0's auc: 0.729255
[800]	valid_0's auc: 0.731126
[1000]	valid_0's auc: 0.732328
[1200]	valid_0's auc: 0.733153
[1400]	valid_0's auc: 0.733735
[1600]	valid_0's auc: 0.734251
[1800]	valid_0's auc: 0.734688
[2000]	valid_0's auc: 0.734963
[2200]	valid_0's auc: 0.735266
[2400]	valid_0's auc: 0.735463
[2600]	valid_0's auc: 0.735594
[2800]	valid_0's auc: 0.735645
[3000]	valid_0's auc: 0.735725
[3200]	valid_0's auc: 0.73587
[3400]	valid_0's auc: 0.735887
[3600]	valid_0's auc: 0.735977
[3800]	valid_0's auc: 0.735992
[4000]	valid_0's auc: 0.736006
[4200]	valid_0's auc: 0.736036
[4400]	valid_0's auc: 0.736071
[4600]	valid_0's auc: 0.736012
Early stopping, best iteration is:
[4452]	valid_0's auc: 0.736092
0.7360919582991823


In [11]:
result = pd.DataFrame({'id': data_test_a['id'], 'isDefault': test})
result.head()

Unnamed: 0,id,isDefault
0,800000,0.00739
1,800001,0.030056
2,800002,0.064912
3,800003,0.031859
4,800004,0.035386


In [12]:
result.to_csv('result.csv', index=0)