In [1]:
"""
特征组合：Dict+GroupBy+nlp
特征选择方式：Wrapper
参数寻优办法：hyperopt
模型：lightgbm
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe 
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error

path = '/Volumes/U397/机器学习/机器学习比赛/机器学习算法竞赛实战/第8章 kaggle 信用卡忠诚度预测/'

# lgb+hyperopt 线下CV得分3.6679(相比baseline-0.0178）

In [20]:
def read_data(debug=True):
    """
    读取数据
    :param debug:是否调试版，可以极大节省debug时间
    :return:训练集，测试集
    """
    print("read_data...")
    NROWS = 10000 if debug else None
    train_dict = pd.read_csv(path+"preprocess/train_dict.csv", nrows=NROWS) #nrows=None是读取全部行
    test_dict = pd.read_csv(path+"preprocess/test_dict.csv", nrows=NROWS)
    train_groupby = pd.read_csv(path+"preprocess/train_groupby.csv", nrows=NROWS)
    test_groupby = pd.read_csv(path+"preprocess/test_groupby.csv", nrows=NROWS)

    # 去除重复列
    for co in train_dict.columns:
        if co in train_groupby.columns and co!='card_id':
            del train_groupby[co]
    for co in test_dict.columns:
        if co in test_groupby.columns and co!='card_id':
            del test_groupby[co]

    train = pd.merge(train_dict, train_groupby, how='left', on='card_id')
    test = pd.merge(test_dict, test_groupby, how='left', on='card_id')
    print("done")
    return train, test


def feature_select_wrapper(train, test):
    """
    按照lightgbm的feature_importance进行特征筛选
    :param train:
    :param test:
    :return:
    """
    print('feature_select_wrapper...')
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # 配置模型的训练参数
    params_initial = {
        'num_leaves': 31,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        'min_child_samples': 20,
        'bagging_seed': 2020,
        'bagging_fraction': 0.7, #行采样
        'bagging_freq': 1,#重新采样的频率，每k次迭代后重新按照bagging_fraction比例进行行采样
        'feature_fraction': 0.7, #列采样
        'max_depth': -1,
        'metric': 'rmse',
        'reg_alpha': 0,
        'reg_lambda': 1,
        'objective': 'regression'
    }
    ESR = 30    #early_stop_round
    NBR = 10000 #num_boost_round
    VBE = 50    #verbose_eval
    callbacks = [lgb.log_evaluation(VBE),lgb.early_stopping(ESR)]
    
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    fse = pd.Series(0, index=features)
    for i,(train_part_index, val_index) in enumerate(kf.split(train[features], train[label])):
        # 模型训练
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        val_part = lgb.Dataset(train[features].loc[val_index],
                           train[label].loc[val_index])
        bst = lgb.train(params_initial, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, val_part],
                        valid_names=['train', 'valid'],
                        callbacks=callbacks)
        '''
        early_stopping_rounds : int or None, optional (default=None)
            Activates early stopping. The model will train until the validation score stops improving.
        Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
        to continue training.
            Requires at least one validation data and one metric.
        注意If there's more than one, will check all of them. But the training data is ignored anyway.
        To check only the first metric, set the ``first_metric_only`` parameter to ``True`` in ``params``.
        The index of iteration that has the best performance will be saved in the ``best_iteration`` field
        if early stopping logic is enabled by setting ``early_stopping_rounds``
        '''
        fse += pd.Series(bst.feature_importance(), features) #统计5折加总的feature_importance
        #print(pd.Series(bst.feature_importance(), features).head(10))
        #print(fse.head(10))

    feature_select = ['card_id'] + fse.sort_values(ascending=False).index.tolist()[:300]
    print('done')
    return train[feature_select + ['target']], test[feature_select]


In [14]:
def params_append(params):
    """
    默认情况下，在构造LightGBM数据集对象时，将基于min_data_in_leaf的值过滤掉某些特征。
    举一个简单的例子，考虑一个具有一个名为feature_1的特征的1000个观测数据集。 
    feature_1仅采用两个值：25.0（995个观测值）和50.0（5个观测值）。如果min_data_in_leaf = 10,则此特征没有拆分。
    在构建数据集之前，LightGBM不会在重新训练时重新考虑此特征并每次迭代时都忽略它，而是在训练之前将其过滤掉。
    可以通过设置feature_pre_filter = False来覆盖此默认行为，以免在hyperopt调参报错
    """
    params['feature_pre_filter'] = False
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['bagging_seed'] = 2020
    return params


def param_hyperopt(train):
    """
    返回最佳参数
    :param train:
    :return:
    """
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')
    train_data = lgb.Dataset(train[features], train[label], silent=True)
    def hyperopt_objective(params):#优化的目标函数
        """
        :param params:
        :return:
        """
        params = params_append(params)
        print(params)
        callbacks = [lgb.early_stopping(20)]
        res = lgb.cv(params, train_data, 1000,
                     nfold=2,
                     stratified=False,
                     shuffle=True,
                     metrics='rmse',
                     callbacks=callbacks,
                     show_stdv=False,
                     seed=2020)
        #result记录了每一颗树的eval_metric,因此rmse-mean中的最小值即对应最后的最优结果
        return min(res['rmse-mean'])#目标函数 可以自己选择 这里为什么是min？
    
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 1e-2, 5e-1), #均匀分布
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
        'num_leaves': hp.choice('num_leaves', list(range(10, 300, 10))),
        'reg_alpha': hp.randint('reg_alpha', 0, 10),#随机整数
        'reg_lambda': hp.uniform('reg_lambda', 0, 10),
        'bagging_freq': hp.randint('bagging_freq', 1, 10),
        'min_child_samples': hp.choice('min_child_samples', list(range(1, 30, 5)))
        }
    #fmin:Minimize a function over a hyperparameter space.
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2020))
    return params_best


In [None]:
tpe.suggest?


In [15]:
def train_predict(train, test, params):
    """

    :param train:
    :param test:
    :param params:
    :return:
    """
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')
    params = params_append(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series(index=train.index)
    ESR = 30
    NBR = 10000
    VBE = 50
    callbacks = [lgb.log_evaluation(VBE),lgb.early_stopping(ESR)]
    for i,(train_part_index, val_index) in enumerate(kf.split(train[features], train[label])):
        # 模型训练
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        val_part = lgb.Dataset(train[features].loc[val_index],
                           train[label].loc[val_index])
        #这里的bst是boost不是best
        bst = lgb.train(params, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, val_part],
                        valid_names=['train', 'valid'],
                        callbacks=callbacks)
        #k-fold的每一次都用来预测一下test
        prediction_test += bst.predict(test[features])
        
        #k-fold每一次都预测不同的val，拼在一次后相当于是预测了完整的train，实现了k-fold交叉验证法的样本内误差计算准备
        prediction_train = pd.concat([prediction_train,pd.Series(bst.predict(train[features].loc[val_index]),
                                                             index=val_index)],ignore_index=False)

        #计算每次对val预测得到的分数
        val_pre = bst.predict(train[features].loc[val_index])
        score = np.sqrt(mean_squared_error(train[label].loc[val_index].values, val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    pd.Series(prediction_train.sort_index().values).to_csv(path+"preprocess/train_lightgbm.csv", index=False)
    pd.Series(prediction_test / 5).to_csv(path+"preprocess/test_lightgbm.csv", index=False)
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv(path+"result/submission_lightgbm.csv", index=False)
    return

if __name__ == "__main__":
    train, test = read_data(debug=False)
    train, test = feature_select_wrapper(train, test)
    best_clf = param_hyperopt(train)
    train_predict(train, test, best_clf)
# [3.686192535745703, 3.647032390847285, 3.706089838227353, 3.773664215095074, 3.5735473296458626] 3.677305261912256
# [3.671956482832931, 3.635001589234533, 3.696673220118413, 3.770417301661309, 3.5653634108368886] 3.667882400936814

read_data...
done
feature_select_wrapper...
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 227965
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1637
[LightGBM] [Info] Start training from score -0.390986
Training until validation scores don't improve for 30 rounds
[50]	train's rmse: 3.43102	valid's rmse: 3.70777
Early stopping, best iteration is:
[57]	train's rmse: 3.41246	valid's rmse: 3.70645
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228067
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1640
[LightGBM] [Info] Start training from score -0.396781
[50]	train's rmse: 3.44484	valid's rmse: 3.66146
Early stopping, best iteration is:
[66]	train's rmse: 3.39923	valid's rmse: 3.65861
You can set `




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
Training until validation scores don't improve for 20 rounds                    
Early stopping, best iteration is:                                              
[8]	cv_agg's rmse: 3.73163 + 0.00627336
{'bagging_fraction': 0.512262561313273, 'bagging_freq': 3, 'feature_fraction': 0.6864235320941958, 'learning_rate': 0.31527024380943564, 'min_child_samples': 21, 'num_leaves': 180, 'reg_alpha': 2, 'reg_

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
Training until validation scores don't improve for 20 rounds                    
Early stopping, best iteration is:                                              
[23]	cv_agg's rmse: 3.72112 + 0.00931603
{'bagging_fraction': 0.6843213475295423, 'bagging_freq': 4, 'feature_fraction': 0.7924623289134336, 'learning_rate': 0.3020635372075119, 'min_child_samples': 36, 'num_leaves': 120, 'reg_alpha': 6, 'reg_lambda': 2.0996363046714714, 'feature_pre_filter': False, 'obje

[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
Training until validation scores don't improve for 20 rounds                    
Early stopping, best iteration is:                                              
[8]	cv_agg's rmse: 3.73846 + 0.00295492
{'bagging_fraction': 0.907813520674998, 'bagging_freq': 5, 'feature_fraction': 0.9451268493017603, 'learning_rate': 0.07796335300461185, 'min_child_samples': 33, 'num_leaves': 10, 'reg_alpha': 4, 'reg_lambda': 3.3416874863644837, 'feature_pre_filter': False, 'objective': 'regression', 'metric': 'rmse', 'bagging_seed': 2020}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66433                                              
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 30

[600]	train's rmse: 3.31938	valid's rmse: 3.63885
[650]	train's rmse: 3.29958	valid's rmse: 3.6383
[700]	train's rmse: 3.27816	valid's rmse: 3.63657
[750]	train's rmse: 3.25799	valid's rmse: 3.63608
[800]	train's rmse: 3.23897	valid's rmse: 3.63526
[850]	train's rmse: 3.22046	valid's rmse: 3.63504
Early stopping, best iteration is:
[847]	train's rmse: 3.22142	valid's rmse: 3.635
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66017
[LightGBM] [Info] Number of data points in the train set: 161534, number of used features: 300
[LightGBM] [Info] Start training from score -0.390348
[50]	train's rmse: 3.69367	valid's rmse: 3.77574
[100]	train's rmse: 3.61224	valid's rmse: 3.74338
[150]	train's rmse: 3.55484	valid's rmse: 3.72881
[200]	train's rmse: 3.51168	valid's rmse: 3.71968
[250]	train's rmse: 3.47679	valid's rmse: 3.71401
[300]	train's rmse: 3.44687	valid's rmse: 3.70931
[350]	train's

# 对比不采用贝叶斯调参的线下CV结果 3.6783
(相比baseline-0.0074，相比贝叶斯调参+0.0104）

In [21]:
def train_predict1(train, test, params):
    """

    :param train:
    :param test:
    :param params:
    :return:
    """
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')
    params = params_append(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series(index=train.index)
    ESR = 30
    NBR = 10000
    VBE = 50
    callbacks = [lgb.log_evaluation(VBE),lgb.early_stopping(ESR)]
    for i,(train_part_index, val_index) in enumerate(kf.split(train[features], train[label])):
        # 模型训练
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        val_part = lgb.Dataset(train[features].loc[val_index],
                           train[label].loc[val_index])
        #这里的bst是boost不是best
        bst = lgb.train(params, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, val_part],
                        valid_names=['train', 'valid'],
                        callbacks=callbacks)
        #k-fold的每一次都用来预测一下test
        prediction_test += bst.predict(test[features])
        
        #k-fold每一次都预测不同的val，拼在一次后相当于是预测了完整的train，实现了k-fold交叉验证法的样本内误差计算准备
        prediction_train = pd.concat([prediction_train,pd.Series(bst.predict(train[features].loc[val_index]),
                                                             index=val_index)],ignore_index=False)

        #计算每次对val预测得到的分数
        val_pre = bst.predict(train[features].loc[val_index])
        score = np.sqrt(mean_squared_error(train[label].loc[val_index].values, val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    return

if __name__ == "__main__":
    train, test = read_data(debug=False)
    train, test = feature_select_wrapper(train, test)
    params_initial = {
        'num_leaves': 31,
        'learning_rate': 0.05,
        'boosting': 'gbdt',
        'min_child_samples': 20,
        'bagging_seed': 2020,
        'bagging_fraction': 0.7, #行采样
        'bagging_freq': 1,#重新采样的频率，每k次迭代后重新按照bagging_fraction比例进行行采样
        'feature_fraction': 0.7, #列采样
        'max_depth': -1,
        'metric': 'rmse',
        'reg_alpha': 0,
        'reg_lambda': 1,
        'objective': 'regression'
    }
    train_predict1(train, test, params_initial)


read_data...
done
feature_select_wrapper...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227965
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1637
[LightGBM] [Info] Start training from score -0.390986
Training until validation scores don't improve for 100 rounds
[50]	train's rmse: 3.43102	valid's rmse: 3.70777
[100]	train's rmse: 3.29549	valid's rmse: 3.70918
[150]	train's rmse: 3.2017	valid's rmse: 3.71565
Early stopping, best iteration is:
[57]	train's rmse: 3.41246	valid's rmse: 3.70645
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 228067
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1640
[LightGBM] [Info] Start training from score -0.396781
[50]	train's rmse: 3.44484	valid's rmse: 3.66146
[100]	train's rmse: 3.31554	valid's rmse: 3.66409
[150]	train's rmse: 3.21636	valid's rmse: 3.66967
Early stopping, best iter