<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#bayesopt-3.6670-(-0.0187)" data-toc-modified-id="bayesopt-3.6670-(-0.0187)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>bayesopt 3.6670 (-0.0187)</a></span></li><li><span><a href="#hyperopt--3.6704" data-toc-modified-id="hyperopt--3.6704-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>hyperopt  3.6704</a></span></li><li><span><a href="#XGB-v2.0" data-toc-modified-id="XGB-v2.0-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>XGB v2.0</a></span></li></ul></div>

In [1]:
"""
特征组合：Dict+GroupBy+nlp
特征选择方式：chi2
参数寻优办法：beyesian
模型：xgboost
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from scipy import sparse
from sklearn.feature_selection import f_regression
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

path = '/Volumes/U397/机器学习/机器学习比赛/机器学习算法竞赛实战/第8章 kaggle 信用卡忠诚度预测/'

In [2]:
def read_data(debug=True):
    """

    :param debug:
    :return:
    """
    print("read_data...")
    NROWS = 10000 if debug else None
    train_dict = pd.read_csv(path+"preprocess/train_dict.csv", nrows=NROWS)
    test_dict = pd.read_csv(path+"preprocess/test_dict.csv", nrows=NROWS)
    train_groupby = pd.read_csv(path+"preprocess/train_groupby.csv", nrows=NROWS)
    test_groupby = pd.read_csv(path+"preprocess/test_groupby.csv", nrows=NROWS)

    # 去除重复列
    for co in train_dict.columns:
        if co in train_groupby.columns and co!='card_id':
            del train_groupby[co]
    for co in test_dict.columns:
        if co in test_groupby.columns and co!='card_id':
            del test_groupby[co]

    train = pd.merge(train_dict, train_groupby, how='left', on='card_id').fillna(0)
    test = pd.merge(test_dict, test_groupby, how='left', on='card_id').fillna(0)

    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    train_x = sparse.load_npz(path+"preprocess/train_nlp.npz")
    test_x = sparse.load_npz(path+"preprocess/test_nlp.npz")

    train_x = sparse.hstack((train_x, train[features])).tocsr()#合并成sparse矩阵
    test_x = sparse.hstack((test_x, test[features])).tocsr()#合并成sparse矩阵
    print("done")
    return train_x, test_x


# bayesopt 3.6670 (-0.0187)

In [3]:
def params_append(params):
    """
    :param params:
    :return:
    """
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params["min_child_weight"] = int(params["min_child_weight"]) #改成整数类型
    params['max_depth'] = int(params['max_depth']) #改成整数类型
    return params


def param_beyesian(train):
    """
    :param train:
    :return:
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    train_data = xgb.DMatrix(train.tocsr(),
                             train_y.values, 
                             silent=True)
    
    def bayesopt_objective(colsample_bytree, subsample,
                           min_child_weight, max_depth,
                           reg_alpha, reg_lambda, eta):#优化的目标函数
        params = {'objective': 'reg:squarederror',
                  'eval_metric': 'rmse'}
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0) #这里是确保为正
        params['subsample'] = max(min(subsample, 1), 0) #这里是确保为正
        params["min_child_weight"] = int(min_child_weight) #这里是确保为整数
        params['max_depth'] = int(max_depth) #这里是确保为整数
        params['eta'] = float(eta)
        params['reg_alpha'] = max(reg_alpha, 0)  #这里是确保为整数
        params['reg_lambda'] = max(reg_lambda, 0)  #这里是确保为整数
        #print(params)
        
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        #cv_result记录了每一颗树的eval_metric,因此test-rmse-mean中的最小值即对应最后的最优结果
        return -min(cv_result['test-rmse-mean'])#bayes-opt是最大化，所以这里加负号 
    
    xgb_bo = BayesianOptimization(
        bayesopt_objective,
        {'colsample_bytree': (0.5, 1),
         'subsample': (0.5, 1),
         'min_child_weight': (1, 30),
         'max_depth': (5, 12),
         'reg_alpha': (0, 5),
         'eta':(0.02, 0.2),
         'reg_lambda': (0, 5)}
    )
    xgb_bo.maximize(init_points=21, n_iter=5)  # init_points表示初始点，n_iter代表迭代次数（即采样数）
    print(xgb_bo.max['target'], xgb_bo.max['params'])
    return xgb_bo.max['params']


In [4]:
def train_predict(train, test, params):
    """

    :param train:
    :param test:
    :param params:
    :return:
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series(None,index=train_y.index)
    ESR = 30     #early_stopping_rounds
    NBR = 10000  #num_boost_rounds
    VBE = 50     #verbose_eval
    for i,(train_part_index, val_index) in enumerate(kf.split(train, train_y)):
        # 模型训练
        train_part = xgb.DMatrix(train.tocsr()[train_part_index, :],
                                 train_y.loc[train_part_index])
        val_part = xgb.DMatrix(train.tocsr()[val_index, :],
                           train_y.loc[val_index])
        #bst是boost，不是best
        bst = xgb.train(params, train_part, NBR, 
                        evals=[(train_part, 'train'),(val_part, 'eval')], 
                        verbose_eval=VBE,
                        maximize=False, 
                        early_stopping_rounds=ESR)
        prediction_test += bst.predict(test_data)
        val_pre = bst.predict(val_part)
        prediction_train = pd.concat([prediction_train,pd.Series(val_pre, index=val_index)],ignore_index=False)
        score = np.sqrt(mean_squared_error(train_y.loc[val_index].values, val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    pd.Series(prediction_train.sort_index().values).to_csv(path+"preprocess/train_xgboost.csv", index=False)
    pd.Series(prediction_test / 5).to_csv(path+"preprocess/test_xgboost.csv", index=False)
    test = pd.read_csv(path+'data/test.csv')
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv(path+"result/submission_xgboost.csv", index=False)
    return

if __name__ == "__main__":
    train, test = read_data(debug=False)
    best_clf = param_beyesian(train)
    train_predict(train, test, best_clf)
# [3.6799306462307517, 3.6476521867457588, 3.698480976611057, 3.7718461304040853, 3.579301270046094] 3.6754422420075494

read_data...
done
|   iter    |  target   | colsam... |    eta    | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.7308046880994747, 'subsample': 0.5896472861648605, 'min_child_weight': 21, 'max_depth': 6, 'eta': 0.17699413134901845, 'reg_alpha': 1.5650410736509723, 'reg_lambda': 2.6196802076219017}
| [0m1        [0m | [0m-3.702   [0m | [0m0.7308   [0m | [0m0.177    [0m | [0m6.68     [0m | [0m21.67    [0m | [0m1.565    [0m | [0m2.62     [0m | [0m0.5896   [0m |
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.5684544762437197, 'subsample': 0.9788778842283206, 'min_child_weight': 25, 'max_depth': 6, 'eta': 0.12319646416386952, 'reg_alpha': 0.7311744821287564, 'reg_lambda': 1.878122346316694}
| [95m2        [0m | [95m-3.691   [0m | [95m0.

| [0m19       [0m | [0m-3.708   [0m | [0m0.9013   [0m | [0m0.1702   [0m | [0m6.035    [0m | [0m3.267    [0m | [0m1.647    [0m | [0m3.11     [0m | [0m0.8131   [0m |
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.8761978542487143, 'subsample': 0.747960695780887, 'min_child_weight': 3, 'max_depth': 7, 'eta': 0.060787753707893216, 'reg_alpha': 2.03884731019329, 'reg_lambda': 1.9990651610140193}
| [0m20       [0m | [0m-3.703   [0m | [0m0.8762   [0m | [0m0.06079  [0m | [0m7.088    [0m | [0m3.271    [0m | [0m2.039    [0m | [0m1.999    [0m | [0m0.748    [0m |
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.92991990784699, 'subsample': 0.5681039794824314, 'min_child_weight': 27, 'max_depth': 10, 'eta': 0.14053243050532688, 'reg_alpha': 0.5110002800372249, 'reg_lambda': 2.5988773394678337}
| [0m21       [0m | [0m-3.713   [0m | [0m0.9299   [0m | [0m0.1405   [0m | [0m10.17    [0m | [0m27.9

# hyperopt  3.6704

In [13]:
def params_append1(params):
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    return params

def param_hyperopt(train):
    """
    返回最佳参数
    :param train:
    :return:
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    train_data = xgb.DMatrix(train.tocsr(),train_y.values, silent=True)
    def hyperopt_objective(params):#优化的目标函数
        """
        :param params:
        :return:
        """
        params = params_append1(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        return min(cv_result['test-rmse-mean'])
    
    params_space = {
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'subsample': hp.uniform('subsample',0.5, 1),
        'min_child_weight': hp.randint('min_child_weight',1, 30),
        'max_depth': hp.randint('max_depth', 5, 12),
        'reg_alpha': hp.randint('reg_alpha', 0, 5),
        'eta': hp.uniform('eta',0.02, 0.2),
        'reg_lambda': hp.uniform('reg_lambda',0, 5)
        }
    #fmin:Minimize a function over a hyperparameter space.
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2020))
    return params_best

def train_predict1(train, test, params):
    train_y = pd.read_csv(path+"data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append1(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series(None,index=train_y.index)
    ESR = 30     #early_stopping_rounds
    NBR = 10000  #num_boost_rounds
    VBE = 50     #verbose_eval
    for i,(train_part_index, val_index) in enumerate(kf.split(train, train_y)):
        # 模型训练
        train_part = xgb.DMatrix(train.tocsr()[train_part_index, :],
                                 train_y.loc[train_part_index])
        val_part = xgb.DMatrix(train.tocsr()[val_index, :],
                           train_y.loc[val_index])
        #bst是boost，不是best
        bst = xgb.train(params, train_part, NBR, 
                        evals=[(train_part, 'train'),(val_part, 'eval')], 
                        verbose_eval=VBE,
                        maximize=False, 
                        early_stopping_rounds=ESR)
        prediction_test += bst.predict(test_data)
        val_pre = bst.predict(val_part)
        prediction_train = pd.concat([prediction_train,pd.Series(val_pre, index=val_index)],ignore_index=False)
        score = np.sqrt(mean_squared_error(train_y.loc[val_index].values, val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    return

if __name__ == "__main__":
    #train, test = read_data(debug=False)
    best_clf = param_hyperopt(train)
    train_predict1(train, test, best_clf)

100%|█████| 30/30 [10:27:13<00:00, 1254.44s/trial, best loss: 3.679107027409163]
[0]	train-rmse:3.94069	eval-rmse:3.94715
[50]	train-rmse:3.68929	eval-rmse:3.73195
[100]	train-rmse:3.62835	eval-rmse:3.69849
[150]	train-rmse:3.60638	eval-rmse:3.68934
[200]	train-rmse:3.59285	eval-rmse:3.68547
[250]	train-rmse:3.58136	eval-rmse:3.68345
[300]	train-rmse:3.57030	eval-rmse:3.68195
[350]	train-rmse:3.55974	eval-rmse:3.68106
[384]	train-rmse:3.55366	eval-rmse:3.68090
[0]	train-rmse:3.94766	eval-rmse:3.92004
[50]	train-rmse:3.69998	eval-rmse:3.69647
[100]	train-rmse:3.63775	eval-rmse:3.65773
[150]	train-rmse:3.61847	eval-rmse:3.64928
[200]	train-rmse:3.60181	eval-rmse:3.64466
[250]	train-rmse:3.59000	eval-rmse:3.64243
[300]	train-rmse:3.58148	eval-rmse:3.64084
[350]	train-rmse:3.57150	eval-rmse:3.63973
[400]	train-rmse:3.56219	eval-rmse:3.63906
[426]	train-rmse:3.55865	eval-rmse:3.63917
[0]	train-rmse:3.93586	eval-rmse:3.96612
[50]	train-rmse:3.68526	eval-rmse:3.74898
[100]	train-rmse:3.62338	

# XGB v2.0

In [None]:
def params_append2(params):
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params['tree_method'] = 'hist' #近似树构建算法，在不牺牲模型精度的情况下显着提高了训练速度
    
    return params

def param_hyperopt(train):
    """
    返回最佳参数
    :param train:
    :return:
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    train_data = xgb.DMatrix(train.tocsr(),train_y.values, silent=True)
    def hyperopt_objective(params):#优化的目标函数
        """
        :param params:
        :return:
        """
        params = params_append2(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=50)
        return min(cv_result['test-rmse-mean'])
    
    params_space = {
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'subsample': hp.uniform('subsample',0.5, 1),
        'min_child_weight': hp.randint('min_child_weight',1, 30),
        'max_depth': hp.randint('max_depth', 5, 12),
        'reg_alpha': hp.randint('reg_alpha', 0, 5),
        'eta': hp.uniform('eta',0.02, 0.2),
        'reg_lambda': hp.uniform('reg_lambda',0, 5)
        }
    #fmin:Minimize a function over a hyperparameter space.
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2020))
    return params_best

def train_predict1(train, test, params):
    train_y = pd.read_csv(path+"data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append2(params) #加上其他不参与贝叶斯优化的参数 
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series(None,index=train_y.index)
    ESR = 30     #early_stopping_rounds
    NBR = 10000  #num_boost_rounds
    VBE = 50     #verbose_eval
    for i,(train_part_index, val_index) in enumerate(kf.split(train, train_y)):
        # 模型训练
        train_part = xgb.DMatrix(train.tocsr()[train_part_index, :],
                                 train_y.loc[train_part_index])
        val_part = xgb.DMatrix(train.tocsr()[val_index, :],
                           train_y.loc[val_index])
        #bst是boost，不是best
        bst = xgb.train(params, train_part, NBR, 
                        evals=[(train_part, 'train'),(val_part, 'eval')], 
                        verbose_eval=VBE,
                        maximize=False, 
                        early_stopping_rounds=ESR)
        prediction_test += bst.predict(test_data)
        val_pre = bst.predict(val_part)
        prediction_train = pd.concat([prediction_train,pd.Series(val_pre, index=val_index)],ignore_index=False)
        score = np.sqrt(mean_squared_error(train_y.loc[val_index].values, val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    return

if __name__ == "__main__":
    train, test = read_data(debug=False)
    best_clf = param_hyperopt(train)
    print(best_clf)
    train_predict1(train, test, best_clf)

read_data...
done
[0]	train-rmse:3.91971+0.02157	test-rmse:3.93268+0.02349                                         
[50]	train-rmse:3.32060+0.01485	test-rmse:3.70064+0.02055                                        
[100]	train-rmse:3.24954+0.01840	test-rmse:3.69527+0.01917                                       
[137]	train-rmse:3.21749+0.02106	test-rmse:3.69534+0.01839                                       
[0]	train-rmse:3.89321+0.02140	test-rmse:3.91504+0.02290                                         
[50]	train-rmse:3.28956+0.01383	test-rmse:3.69665+0.01898                                        
[82]	train-rmse:3.24005+0.00916	test-rmse:3.69750+0.01815                                        
[0]	train-rmse:3.88858+0.02065	test-rmse:3.91057+0.02335                                         
[50]	train-rmse:3.30672+0.01401	test-rmse:3.69560+0.01919                                        
[75]	train-rmse:3.26349+0.01966	test-rmse:3.69712+0.01864                                        
 1

In [None]:
import xgboost as xgb
from scipy.sparse import csr_matrix
# Convert data to a sparse matrix
sparse_data = csr_matrix(data)
# Train XGBoost model with sparse data
dtrain = xgb.DMatrix(sparse_data, label=labels)
model = xgb.train(params, dtrain)

In [4]:
xgb.__version__

'1.7.3'

In [3]:
import gc
gc.collect()

20