In [1]:
"""
特征组合：Dict+GroupBy+nlp
特征选择方式：chi2
参数寻优办法：bayesian
模型：xgboost
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from scipy import sparse
from sklearn.feature_selection import f_regression
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Root directory of the competition data. The functions in this file build
# file names by plain string concatenation (path + "preprocess/...",
# path + "data/...", path + "result/..."), so the trailing '/' is required.
path = '/Volumes/U397/机器学习/机器学习比赛/机器学习算法竞赛实战/第8章 kaggle 信用卡忠诚度预测/'

In [2]:
def read_data(debug=True):
    """Load the dict/groupby feature tables and stack them onto the NLP features.

    The dict and groupby CSV tables are merged on ``card_id`` (a column that
    exists in both tables is kept only from the dict table), missing values
    are filled with 0, and the resulting dense columns are appended to the
    right of the pre-computed sparse NLP matrices.

    :param debug: when True only the first 10000 rows of each CSV are read.
        The NLP matrices are then truncated to the same number of rows,
        otherwise the horizontal stack fails on mismatched shapes.
        NOTE(review): this assumes the rows of the ``.npz`` files are stored
        in the same order as the CSV rows -- confirm against preprocessing.
    :return: ``(train_x, test_x)`` as CSR sparse matrices.
    """
    print("read_data...")
    nrows = 10000 if debug else None
    train_dict = pd.read_csv(path+"preprocess/train_dict.csv", nrows=nrows)
    test_dict = pd.read_csv(path+"preprocess/test_dict.csv", nrows=nrows)
    train_groupby = pd.read_csv(path+"preprocess/train_groupby.csv", nrows=nrows)
    test_groupby = pd.read_csv(path+"preprocess/test_groupby.csv", nrows=nrows)

    # Drop from the groupby tables every column that the dict tables already
    # provide; card_id is kept because it is the merge key.
    train_dup = [co for co in train_dict.columns
                 if co in train_groupby.columns and co != 'card_id']
    train_groupby = train_groupby.drop(columns=train_dup)
    test_dup = [co for co in test_dict.columns
                if co in test_groupby.columns and co != 'card_id']
    test_groupby = test_groupby.drop(columns=test_dup)

    train = pd.merge(train_dict, train_groupby, how='left', on='card_id').fillna(0)
    test = pd.merge(test_dict, test_groupby, how='left', on='card_id').fillna(0)

    # Model features are all merged columns except the id and the label.
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    train_nlp = sparse.load_npz(path+"preprocess/train_nlp.npz")
    test_nlp = sparse.load_npz(path+"preprocess/test_nlp.npz")
    if debug:
        # The CSVs were cut to `nrows` rows above; cut the NLP matrices the
        # same way so that hstack sees matching row counts.
        train_nlp = train_nlp.tocsr()[:train.shape[0]]
        test_nlp = test_nlp.tocsr()[:test.shape[0]]

    # Combine NLP features and tabular features into one sparse matrix.
    train_x = sparse.hstack((train_nlp, train[features])).tocsr()
    test_x = sparse.hstack((test_nlp, test[features])).tocsr()
    print("done")
    return train_x, test_x


# bayesopt

In [3]:
def params_append(params):
    """Complete a tuned parameter dict so it can be passed to xgboost.

    The dict is modified in place: the regression objective and RMSE metric
    are set, and the two tree-structure parameters are cast to ``int``.

    :param params: dict containing at least ``min_child_weight`` and
        ``max_depth``.
    :return: the same dict object that was passed in.
    """
    params.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse'})
    # The optimiser proposes floats for these; cast them to integers.
    for int_key in ("min_child_weight", 'max_depth'):
        params[int_key] = int(params[int_key])
    return params


def param_beyesian(train):
    """Search xgboost hyper-parameters with Bayesian optimisation (bayes_opt).

    Every candidate is scored by 2-fold ``xgb.cv`` (at most 1000 rounds,
    early stopping after 30 rounds) on the full training matrix; the labels
    are read from ``data/train.csv``.

    :param train: sparse training feature matrix (must provide ``.tocsr()``).
    :return: the ``params`` dict of the best point found. These are the raw
        values proposed by the optimiser (e.g. ``max_depth`` is still a
        float); they are not clipped or cast here.
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    train_data = xgb.DMatrix(train.tocsr(),
                             train_y.values,
                             silent=True)

    def bayesopt_objective(colsample_bytree, subsample,
                           min_child_weight, max_depth,
                           reg_alpha, reg_lambda, eta):
        """Objective to maximise: negative best mean CV test RMSE."""
        params = {'objective': 'reg:squarederror',
                  'eval_metric': 'rmse'}
        # Sampling ratios are clipped into [0, 1].
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        # Tree-structure parameters are cast to integers.
        params["min_child_weight"] = int(min_child_weight)
        params['max_depth'] = int(max_depth)
        params['eta'] = float(eta)
        # Regularisation strengths are kept non-negative.
        params['reg_alpha'] = max(reg_alpha, 0)
        params['reg_lambda'] = max(reg_lambda, 0)

        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        # cv_result holds the metric for every boosting round, so the minimum
        # of test-rmse-mean is the best score reached. bayes_opt maximises,
        # hence the sign flip.
        return -min(cv_result['test-rmse-mean'])

    xgb_bo = BayesianOptimization(
        bayesopt_objective,
        {'colsample_bytree': (0.5, 1),
         'subsample': (0.5, 1),
         'min_child_weight': (1, 30),
         'max_depth': (5, 12),
         'reg_alpha': (0, 5),
         'eta': (0.02, 0.2),
         'reg_lambda': (0, 5)},
        # Seed the optimiser so the search is reproducible, matching the
        # fixed seeds already used for xgb.cv, KFold and the hyperopt search.
        random_state=2020,
    )
    # init_points: random exploration points; n_iter: Bayesian optimisation steps.
    xgb_bo.maximize(init_points=21, n_iter=5)
    print(xgb_bo.max['target'], xgb_bo.max['params'])
    return xgb_bo.max['params']


In [None]:
def train_predict(train, test, params):
    """Train xgboost with 5-fold CV and write out-of-fold/test predictions.

    For each fold a booster is trained with early stopping on the held-out
    part; its predictions on the held-out rows are stored as out-of-fold
    predictions and its predictions on ``test`` are averaged over the folds.

    Files written (relative to ``path``):
      * ``preprocess/train_xgboost.csv`` -- one out-of-fold prediction per
        training row, in training-row order;
      * ``preprocess/test_xgboost.csv`` -- fold-averaged test predictions;
      * ``result/submission_xgboost.csv`` -- ``card_id`` and ``target``.

    :param train: sparse training feature matrix (must provide ``.tocsr()``).
    :param test: test feature matrix accepted by ``xgb.DMatrix``.
    :param params: tuned parameter dict; completed in place by
        ``params_append``.
    :return: None
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append(params)
    n_splits = 5
    kf = KFold(n_splits=n_splits, random_state=2020, shuffle=True)
    train_csr = train.tocsr()  # converted once instead of twice per fold
    labels = train_y.values
    prediction_test = 0
    cv_score = []
    # One slot per training row. Every row is validated exactly once, so the
    # array ends up fully filled. (Previously a NaN-filled Series was
    # concatenated with each fold, doubling the number of rows written.)
    prediction_train = np.full(len(train_y), np.nan)
    ESR = 30     # early_stopping_rounds
    NBR = 10000  # num_boost_round
    VBE = 50     # verbose_eval
    for train_part_index, val_index in kf.split(train_csr, train_y):
        # Fit on the training part of the fold, monitor on the held-out part.
        train_part = xgb.DMatrix(train_csr[train_part_index, :],
                                 labels[train_part_index])
        val_part = xgb.DMatrix(train_csr[val_index, :],
                               labels[val_index])
        # bst stands for "booster", not "best".
        bst = xgb.train(params, train_part, NBR,
                        evals=[(train_part, 'train'), (val_part, 'eval')],
                        verbose_eval=VBE,
                        maximize=False,
                        early_stopping_rounds=ESR)
        prediction_test += bst.predict(test_data)
        val_pre = bst.predict(val_part)
        prediction_train[val_index] = val_pre
        score = np.sqrt(mean_squared_error(labels[val_index], val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / n_splits)
    prediction_test = prediction_test / n_splits
    pd.Series(prediction_train).to_csv(path+"preprocess/train_xgboost.csv", index=False)
    pd.Series(prediction_test).to_csv(path+"preprocess/test_xgboost.csv", index=False)
    submission = pd.read_csv(path+'data/test.csv')
    submission['target'] = prediction_test
    submission[['card_id', 'target']].to_csv(path+"result/submission_xgboost.csv", index=False)
    return

if __name__ == "__main__":
    # Full run: load every row, tune with bayes_opt, then train and predict.
    train, test = read_data(debug=False)
    best_clf = param_beyesian(train)  # a dict of tuned parameters, not a classifier object
    train_predict(train, test, best_clf)
# The next line appears to be output kept from an earlier run
# (train_predict prints the per-fold RMSE list followed by its mean):
# [3.6799306462307517, 3.6476521867457588, 3.698480976611057, 3.7718461304040853, 3.579301270046094] 3.6754422420075494

read_data...
done
|   iter    |  target   | colsam... |    eta    | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.7308046880994747, 'subsample': 0.5896472861648605, 'min_child_weight': 21, 'max_depth': 6, 'eta': 0.17699413134901845, 'reg_alpha': 1.5650410736509723, 'reg_lambda': 2.6196802076219017}
| [0m1        [0m | [0m-3.702   [0m | [0m0.7308   [0m | [0m0.177    [0m | [0m6.68     [0m | [0m21.67    [0m | [0m1.565    [0m | [0m2.62     [0m | [0m0.5896   [0m |
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'colsample_bytree': 0.5684544762437197, 'subsample': 0.9788778842283206, 'min_child_weight': 25, 'max_depth': 6, 'eta': 0.12319646416386952, 'reg_alpha': 0.7311744821287564, 'reg_lambda': 1.878122346316694}
| [95m2        [0m | [95m-3.691   [0m | [95m0.

# 对比hyperopt 施工中

In [None]:
def params_append1(params):
    """Set the regression objective and RMSE metric on ``params`` in place.

    Unlike ``params_append`` this variant performs no integer casts.

    :param params: parameter dict to complete.
    :return: the same dict object that was passed in.
    """
    params.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse'})
    return params

def param_hyperopt(train):
    """Search xgboost hyper-parameters with hyperopt (TPE) and return the best.

    Every candidate is scored by 2-fold ``xgb.cv`` (at most 1000 rounds,
    early stopping after 30 rounds); the labels are read from
    ``data/train.csv``. The search space mirrors the ranges used by the
    bayes_opt search in this file.

    :param train: sparse training feature matrix (must provide ``.tocsr()``).
    :return: the best assignment found by ``fmin``, keyed by parameter name.
    """
    # `train` is a sparse matrix and has no `.columns`; the feature-name
    # bookkeeping that used to be here raised AttributeError and its result
    # was never used, so it has been removed.
    train_y = pd.read_csv(path+"data/train.csv")['target']
    train_data = xgb.DMatrix(train.tocsr(), train_y.values, silent=True)

    def hyperopt_objective(params):
        """Objective to minimise: best mean CV test RMSE for ``params``."""
        params = params_append(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        # hyperopt minimises, so the RMSE is returned without a sign flip.
        return min(cv_result['test-rmse-mean'])

    params_space = {
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'subsample': hp.uniform('subsample', 0.5, 1),
        'min_child_weight': hp.randint('min_child_weight', 1, 30),
        'max_depth': hp.randint('max_depth', 5, 12),
        # Continuous like reg_lambda and like the bayes_opt search space;
        # it was previously sampled as an integer with hp.randint.
        'reg_alpha': hp.uniform('reg_alpha', 0, 5),
        'eta': hp.uniform('eta', 0.02, 0.2),
        'reg_lambda': hp.uniform('reg_lambda', 0, 5),
    }
    # fmin minimises the objective over the search space.
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2020))
    return params_best

def train_predict1(train, test, params):
    """Evaluate ``params`` with 5-fold CV and print the per-fold and mean RMSE.

    Mirrors ``train_predict`` but completes the parameters with
    ``params_append1`` (no integer casts) and writes no files. The
    out-of-fold and fold-averaged test predictions are computed but not
    persisted by this variant.

    :param train: sparse training feature matrix (must provide ``.tocsr()``).
    :param test: test feature matrix accepted by ``xgb.DMatrix``.
    :param params: tuned parameter dict; completed in place by
        ``params_append1``.
    :return: None
    """
    train_y = pd.read_csv(path+"data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append1(params)
    n_splits = 5
    kf = KFold(n_splits=n_splits, random_state=2020, shuffle=True)
    train_csr = train.tocsr()  # converted once instead of twice per fold
    labels = train_y.values
    prediction_test = 0
    cv_score = []
    # One slot per training row, filled by index. (Previously a NaN-filled
    # Series was concatenated with each fold, which doubled its length.)
    prediction_train = np.full(len(train_y), np.nan)
    ESR = 30     # early_stopping_rounds
    NBR = 10000  # num_boost_round
    VBE = 50     # verbose_eval
    for train_part_index, val_index in kf.split(train_csr, train_y):
        # Fit on the training part of the fold, monitor on the held-out part.
        train_part = xgb.DMatrix(train_csr[train_part_index, :],
                                 labels[train_part_index])
        val_part = xgb.DMatrix(train_csr[val_index, :],
                               labels[val_index])
        # bst stands for "booster", not "best".
        bst = xgb.train(params, train_part, NBR,
                        evals=[(train_part, 'train'), (val_part, 'eval')],
                        verbose_eval=VBE,
                        maximize=False,
                        early_stopping_rounds=ESR)
        prediction_test += bst.predict(test_data)
        val_pre = bst.predict(val_part)
        prediction_train[val_index] = val_pre
        score = np.sqrt(mean_squared_error(labels[val_index], val_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / n_splits)
    return

if __name__ == "__main__":
    # Comparison run: same data, but parameters tuned with hyperopt and
    # evaluated with train_predict1 (prints CV scores only).
    train, test = read_data(debug=False)
    best_clf = param_hyperopt(train)  # the best parameter assignment returned by fmin
    train_predict1(train, test, best_clf)