# 模型训练/调参/预测

In [1]:
import os.path

from constant import *
from sklearn.metrics import roc_curve
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
import lightgbm as lgb
from data import loader,exporter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
def ks_stat(y_true, y_pred):
    """计算KS值的自定义评估函数"""
    fpr, tpr, _ = roc_curve(y_true, y_pred)  # 计算ROC曲线
    ks_value = np.max(np.abs(tpr - fpr))  # KS统计量
    return ks_value


def lgb_ks_eval(y_pred, dataset):
    """用于LightGBM的自定义KS评估函数"""
    y_true = dataset.get_label()
    ks_value = ks_stat(y_true, y_pred)
    # 返回 (名称, 计算的 KS 值, 是否越高越好)
    return 'ks', ks_value, True


# 自定义目标函数，用于贝叶斯优化
def objective(params, X, y, n_folds=5):
    """贝叶斯优化的目标函数，返回负的整体验证集KS分数"""
    # 设置模型参数
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'binary'
    params['verbose'] = -1
    params['early_stopping_round'] = 20
    params['n_estimators'] = 10000

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2023)

    # 存储每一折的预测结果
    final_predictions = np.zeros(len(X))

    for train_idx, valid_idx in kf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # 建立LightGBM训练集
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        # 训练LightGBM模型
        model = lgb.train(params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          feval=lgb_ks_eval,
                          callbacks=[lgb.early_stopping(stopping_rounds=50),
                                     lgb.log_evaluation(10)])

        # 验证集预测概率
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

        # 将每一折的预测结果填入到相应的索引位置
        final_predictions[valid_idx] = y_pred

    # 计算整体的KS分数
    overall_score = ks_stat(y, final_predictions)

    # 返回负的KS分数作为最小化目标
    return {'loss': -overall_score, 'status': STATUS_OK}


# 超参数空间
param_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),  # 学习率
    'num_leaves': hp.choice('num_leaves', np.arange(20, 150, dtype=int)),  # 叶子数
    'max_depth': hp.choice('max_depth', np.arange(3, 12, dtype=int)),  # 树的最大深度
    'min_child_weight': hp.uniform('min_child_weight', 0.001, 10),  # 子叶节点的最小权重
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),  # 样本列采样率
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 20),
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # 样本采样率
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),  # L1正则化
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)  # L2正则化
}


# 贝叶斯优化调参函数
def bayesian_optimize_lgbm(X, y, param_space, max_evals=10):
    trials = Trials()

    # 使用fmin函数进行贝叶斯优化
    best_params = fmin(
        fn=lambda params: objective(params, X, y),  # 优化目标
        space=param_space,  # 参数空间
        algo=tpe.suggest,  # 使用TPE算法
        max_evals=max_evals,  # 最大评估次数
        trials=trials  # 记录每次评估的结果
    )

    return best_params, trials

In [3]:
# 读取数据
# df_target = pd.read_csv(f'{dir_preprocess}/target.csv')
# df_flat = pd.read_csv(f'{dir_preprocess}/expend_1.csv')
# df_v1 = pd.read_csv(f'{dir_preprocess}/v1.csv')
# df = df_target.merge(df_flat, left_on=['CUST_NO'], right_on=['CUST_NO'], how='left').merge(df_v1, left_on=['CUST_NO','SRC','FLAG'], right_on=['CUST_NO','SRC','FLAG'], how='left')

X = loader.to_df(os.path.join(dir_preprocess,'v2.csv'))
y = X.pop("FLAG")  # 目标标签列

# 执行贝叶斯优化
best_params, trials = bayesian_optimize_lgbm(X, y, param_space, max_evals=1)

print("Best parameters found:", best_params)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.332345	training's ks: 0.910028	valid_1's binary_logloss: 0.341138	valid_1's ks: 0.898358
[20]	training's binary_logloss: 0.22029	training's ks: 0.938615	valid_1's binary_logloss: 0.232967	valid_1's ks: 0.928791
[30]	training's binary_logloss: 0.171161	training's ks: 0.950853	valid_1's binary_logloss: 0.186766	valid_1's ks: 0.938443
[40]	training's binary_logloss: 0.142482	training's ks: 0.956221	valid_1's binary_logloss: 0.160914	valid_1's ks: 0.943564
[50]	training's binary_logloss: 0.127313	training's ks: 0.959372	valid_1's binary_logloss: 0.148059	valid_1's ks: 0.945239
[60]	training's binary_logloss: 0.113568	training's ks: 0.963829	valid_1's binary_logloss: 0.137219	valid_1's ks: 0.947602
[70]	training's binary_logloss: 0.102978	training's ks: 0.966932	valid_1's binary_logloss: 0.129256	valid_1's ks: 0.948981
[80]	training's binary_logloss: 0.0943909	training's ks: 0.970699	valid_1's bin




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.337443	training's ks: 0.906457	valid_1's binary_logloss: 0.346554	valid_1's ks: 0.897575
[20]	training's binary_logloss: 0.225204	training's ks: 0.937827	valid_1's binary_logloss: 0.23841	valid_1's ks: 0.929977
[30]	training's binary_logloss: 0.173382	training's ks: 0.949005	valid_1's binary_logloss: 0.189829	valid_1's ks: 0.940516
[40]	training's binary_logloss: 0.143597	training's ks: 0.954743	valid_1's binary_logloss: 0.163282	valid_1's ks: 0.94544
[50]	training's binary_logloss: 0.128113	training's ks: 0.958559	valid_1's binary_logloss: 0.150372	valid_1's ks: 0.946818
[60]	training's binary_logloss: 0.113332	training's ks: 0.962671	valid_1's binary_logloss: 0.138709	valid_1's ks: 0.948887
[70]	training's binary_logloss: 0.103297	training's ks: 0.966365	valid_1's binary_logloss: 0.130915	valid_1's ks: 0.949478
[80]	training's binary_logloss: 0.0955634	training's ks: 0.969566	valid_1's bina




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.338616	training's ks: 0.907025	valid_1's binary_logloss: 0.344045	valid_1's ks: 0.898749
[20]	training's binary_logloss: 0.22559	training's ks: 0.938616	valid_1's binary_logloss: 0.235317	valid_1's ks: 0.932729
[30]	training's binary_logloss: 0.171961	training's ks: 0.94945	valid_1's binary_logloss: 0.186429	valid_1's ks: 0.942973
[40]	training's binary_logloss: 0.143303	training's ks: 0.954941	valid_1's binary_logloss: 0.161252	valid_1's ks: 0.946617
[50]	training's binary_logloss: 0.125248	training's ks: 0.959102	valid_1's binary_logloss: 0.146443	valid_1's ks: 0.94839
[60]	training's binary_logloss: 0.111361	training's ks: 0.963263	valid_1's binary_logloss: 0.136091	valid_1's ks: 0.950064
[70]	training's binary_logloss: 0.101392	training's ks: 0.967449	valid_1's binary_logloss: 0.128192	valid_1's ks: 0.95095
[80]	training's binary_logloss: 0.0927891	training's ks: 0.971167	valid_1's binary




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.337104	training's ks: 0.90865	valid_1's binary_logloss: 0.341105	valid_1's ks: 0.902295
[20]	training's binary_logloss: 0.22538	training's ks: 0.938542	valid_1's binary_logloss: 0.233273	valid_1's ks: 0.934699
[30]	training's binary_logloss: 0.173772	training's ks: 0.949918	valid_1's binary_logloss: 0.185579	valid_1's ks: 0.94445
[40]	training's binary_logloss: 0.144271	training's ks: 0.955556	valid_1's binary_logloss: 0.159764	valid_1's ks: 0.948488
[50]	training's binary_logloss: 0.127172	training's ks: 0.959299	valid_1's binary_logloss: 0.14567	valid_1's ks: 0.950261
[60]	training's binary_logloss: 0.116241	training's ks: 0.961564	valid_1's binary_logloss: 0.136783	valid_1's ks: 0.951738
[70]	training's binary_logloss: 0.104326	training's ks: 0.965578	valid_1's binary_logloss: 0.127095	valid_1's ks: 0.952822
[80]	training's binary_logloss: 0.095453	training's ks: 0.96996	valid_1's binary_l




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.337161	training's ks: 0.903602	valid_1's binary_logloss: 0.344037	valid_1's ks: 0.896385
[20]	training's binary_logloss: 0.225984	training's ks: 0.937089	valid_1's binary_logloss: 0.235649	valid_1's ks: 0.931646
[30]	training's binary_logloss: 0.175532	training's ks: 0.94812	valid_1's binary_logloss: 0.190038	valid_1's ks: 0.943957
[40]	training's binary_logloss: 0.146352	training's ks: 0.953168	valid_1's binary_logloss: 0.164539	valid_1's ks: 0.948094
[50]	training's binary_logloss: 0.128251	training's ks: 0.957772	valid_1's binary_logloss: 0.149308	valid_1's ks: 0.951246
[60]	training's binary_logloss: 0.116569	training's ks: 0.960555	valid_1's binary_logloss: 0.140293	valid_1's ks: 0.952428
[70]	training's binary_logloss: 0.105514	training's ks: 0.963977	valid_1's binary_logloss: 0.132399	valid_1's ks: 0.953511
[80]	training's binary_logloss: 0.0983528	training's ks: 0.967104	valid_1's bin

In [4]:
# 5折交叉验证
def lgb_5_fold(X, y, best_params, n_folds=5):
    best_params.update({
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'verbose': -1,
        'n_estimators': 10000,  # 和贝叶斯优化期间保持一致
        'early_stopping_round': 20
    })

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2023)
    fold_scores = []
    final_predictions = np.zeros(len(X))  # 用于存储每一折的预测结果

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # 训练模型
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        model = lgb.train(best_params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          feval=lgb_ks_eval,
                          callbacks=[lgb.early_stopping(stopping_rounds=50),
                                     lgb.log_evaluation(10)])

        # 保存模型
        model_path = f"{dir_model}/lgbm_fold_{fold + 1}.bin"
        model.save_model(model_path)
        print(f"Model for fold {fold + 1} saved at {model_path}")

        # 验证集预测
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

        # 计算KS
        score = ks_stat(y_valid, y_pred)
        fold_scores.append(score)

        # 将每一折的预测结果保存到对应的索引位置
        final_predictions[valid_idx] = y_pred

        print(f"Fold {fold + 1} - Score: {score:.4f}")

    # 最终综合预测结果的评分
    overall_score = ks_stat(y, final_predictions)

    print(f"\nOverall Score: {overall_score:.4f}")

    return overall_score

In [5]:
# 执行5折交叉验证
best_score = lgb_5_fold(X, y, best_params)
print(f"Best score: {best_score}")




Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.499481	training's ks: 0.82407	valid_1's binary_logloss: 0.50184	valid_1's ks: 0.818976
[20]	training's binary_logloss: 0.38321	training's ks: 0.889419	valid_1's binary_logloss: 0.387246	valid_1's ks: 0.885849
[30]	training's binary_logloss: 0.299756	training's ks: 0.915371	valid_1's binary_logloss: 0.304142	valid_1's ks: 0.911653
[40]	training's binary_logloss: 0.258261	training's ks: 0.926009	valid_1's binary_logloss: 0.26323	valid_1's ks: 0.921208
[50]	training's binary_logloss: 0.229284	training's ks: 0.932903	valid_1's binary_logloss: 0.23429	valid_1's ks: 0.927511
[60]	training's binary_logloss: 0.209526	training's ks: 0.938295	valid_1's binary_logloss: 0.215004	valid_1's ks: 0.933518
[70]	training's binary_logloss: 0.194578	training's ks: 0.940905	valid_1's binary_logloss: 0.200923	valid_1's ks: 0.936375
[80]	training's binary_logloss: 0.185424	training's ks: 0.942727	valid_1's binary_l



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.508846	training's ks: 0.826038	valid_1's binary_logloss: 0.511172	valid_1's ks: 0.821345
[20]	training's binary_logloss: 0.381318	training's ks: 0.889861	valid_1's binary_logloss: 0.385355	valid_1's ks: 0.888416
[30]	training's binary_logloss: 0.306748	training's ks: 0.912416	valid_1's binary_logloss: 0.31204	valid_1's ks: 0.911856
[40]	training's binary_logloss: 0.258188	training's ks: 0.925244	valid_1's binary_logloss: 0.264391	valid_1's ks: 0.927023
[50]	training's binary_logloss: 0.231835	training's ks: 0.931548	valid_1's binary_logloss: 0.239126	valid_1's ks: 0.92978
[60]	training's binary_logloss: 0.210633	training's ks: 0.936325	valid_1's binary_logloss: 0.218348	valid_1's ks: 0.933227
[70]	training's binary_logloss: 0.197787	training's ks: 0.940092	valid_1's binary_logloss: 0.205861	valid_1's ks: 0.936084
[80]	training's binary_logloss: 0.18597	training's ks: 0.94312	valid_1's binary_



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.506099	training's ks: 0.822495	valid_1's binary_logloss: 0.50771	valid_1's ks: 0.820447
[20]	training's binary_logloss: 0.392159	training's ks: 0.883362	valid_1's binary_logloss: 0.394008	valid_1's ks: 0.880823
[30]	training's binary_logloss: 0.300865	training's ks: 0.917342	valid_1's binary_logloss: 0.303704	valid_1's ks: 0.915197
[40]	training's binary_logloss: 0.256973	training's ks: 0.927068	valid_1's binary_logloss: 0.260787	valid_1's ks: 0.923865
[50]	training's binary_logloss: 0.231194	training's ks: 0.93246	valid_1's binary_logloss: 0.236092	valid_1's ks: 0.928494
[60]	training's binary_logloss: 0.211632	training's ks: 0.937458	valid_1's binary_logloss: 0.21735	valid_1's ks: 0.934995
[70]	training's binary_logloss: 0.196117	training's ks: 0.940635	valid_1's binary_logloss: 0.202792	valid_1's ks: 0.937654
[80]	training's binary_logloss: 0.186067	training's ks: 0.942605	valid_1's binary



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.51032	training's ks: 0.817497	valid_1's binary_logloss: 0.508416	valid_1's ks: 0.822122
[20]	training's binary_logloss: 0.381092	training's ks: 0.891636	valid_1's binary_logloss: 0.379287	valid_1's ks: 0.896188
[30]	training's binary_logloss: 0.307252	training's ks: 0.914215	valid_1's binary_logloss: 0.305225	valid_1's ks: 0.917758
[40]	training's binary_logloss: 0.263285	training's ks: 0.925147	valid_1's binary_logloss: 0.262146	valid_1's ks: 0.9281
[50]	training's binary_logloss: 0.236256	training's ks: 0.931992	valid_1's binary_logloss: 0.23616	valid_1's ks: 0.933616
[60]	training's binary_logloss: 0.212382	training's ks: 0.93704	valid_1's binary_logloss: 0.213158	valid_1's ks: 0.939427
[70]	training's binary_logloss: 0.197519	training's ks: 0.940684	valid_1's binary_logloss: 0.199441	valid_1's ks: 0.941101
[80]	training's binary_logloss: 0.185561	training's ks: 0.943393	valid_1's binary_l



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.509617	training's ks: 0.819787	valid_1's binary_logloss: 0.510997	valid_1's ks: 0.815621
[20]	training's binary_logloss: 0.383188	training's ks: 0.886342	valid_1's binary_logloss: 0.386306	valid_1's ks: 0.883286
[30]	training's binary_logloss: 0.305895	training's ks: 0.911531	valid_1's binary_logloss: 0.30945	valid_1's ks: 0.908697
[40]	training's binary_logloss: 0.26102	training's ks: 0.924975	valid_1's binary_logloss: 0.265537	valid_1's ks: 0.921206
[50]	training's binary_logloss: 0.232969	training's ks: 0.932017	valid_1's binary_logloss: 0.237975	valid_1's ks: 0.928199
[60]	training's binary_logloss: 0.214334	training's ks: 0.936178	valid_1's binary_logloss: 0.220197	valid_1's ks: 0.933813
[70]	training's binary_logloss: 0.198946	training's ks: 0.939034	valid_1's binary_logloss: 0.205345	valid_1's ks: 0.937063
[80]	training's binary_logloss: 0.184281	training's ks: 0.942974	valid_1's binar

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

def find_optimal_seed(X, y, param_space, n_folds=5, seed_list=[0, 42, 100, 2023]):
    best_seed = None
    lowest_std = float('inf')
    seed_scores = {}

    for seed in seed_list:
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        fold_scores = []

        for train_idx, valid_idx in skf.split(X, y):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            # 使用与之前一致的参数和模型训练方法
            model = lgb.LGBMClassifier(**param_space)  # 使用参数构建模型
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
            
            y_pred = model.predict_proba(X_valid)[:, 1]
            ks = ks_stat(y_valid, y_pred)
            fold_scores.append(ks)

        # 计算该种子下的折间KS值的标准差
        std_ks = np.std(fold_scores)
        seed_scores[seed] = (np.mean(fold_scores), std_ks)

        if std_ks < lowest_std:
            best_seed = seed
            lowest_std = std_ks

    print(f"Best seed found: {best_seed} with lowest KS std: {lowest_std}")
    return best_seed, seed_scores


optimal_seed, seed_scores = find_optimal_seed(X, y, best_params, n_folds=5, seed_list=[0, 42, 100, 2023])

print(optimal_seed)
print(seed_scores)

NameError: name 'best_params' is not defined

In [6]:
# 测试集预测并生成结果文件
def predict_test_set(test_data, n_folds=5, output_file='upload.csv'):
    test_predictions = np.zeros(len(test_data))

    # 依次加载每一折的模型进行预测
    for fold in range(n_folds):
        model_path = f"{dir_model}/lgbm_fold_{fold + 1}.bin"
        model = lgb.Booster(model_file=model_path)

        # 对测试集进行预测，并将每一折的预测加总
        test_predictions += model.predict(test_data)

    # 平均每一折的预测
    test_predictions /= n_folds

    # 生成预测文件
    submission = pd.DataFrame({'CUST_NO': test_data.index, 'PRED': test_predictions})
    submission.to_csv(output_file, index=False)
    print(f"Test predictions saved to {output_file}")

In [9]:
X_test = pd.read_csv(f'{dir_preprocess}/test.csv')
X_test.set_index('CUST_NO', inplace=True)
predict_test_set(X_test)

Test predictions saved to upload.csv


In [4]:
model_path = f"{dir_model}/lgbm_fold_{0 + 1}.bin"
model = lgb.Booster(model_file=model_path)
feature_importance = model.feature_importance(importance_type='gain')
feature_names = X.fea
# 构建重要性数据框
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

# 筛选重要性较低的特征
low_importance_features = importance_df[importance_df['importance'] < threshold]['feature']

ValueError: All arrays must be of the same length