# 模型训练/调参/预测

In [2]:
from constant import *
from sklearn.metrics import roc_curve
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
import lightgbm as lgb
from data import loader,exporter
import numpy as np
import pandas as pd

In [23]:
def ks_stat(y_true, y_pred):
    """计算KS值的自定义评估函数"""
    fpr, tpr, _ = roc_curve(y_true, y_pred)  # 计算ROC曲线
    ks_value = np.max(np.abs(tpr - fpr))  # KS统计量
    return ks_value


def lgb_ks_eval(y_pred, dataset):
    """用于LightGBM的自定义KS评估函数"""
    y_true = dataset.get_label()
    ks_value = ks_stat(y_true, y_pred)
    # 返回 (名称, 计算的 KS 值, 是否越高越好)
    return 'ks', ks_value, True


# 自定义目标函数，用于贝叶斯优化
def objective(params, X, y, n_folds=5):
    """贝叶斯优化的目标函数，返回负的整体验证集KS分数"""
    # 设置模型参数
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'binary'
    params['verbose'] = -1
    params['early_stopping_round'] = 20
    params['n_estimators'] = 10000

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    # 存储每一折的预测结果
    final_predictions = np.zeros(len(X))

    for train_idx, valid_idx in kf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # 建立LightGBM训练集
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        # 训练LightGBM模型
        model = lgb.train(params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          feval=lgb_ks_eval,
                          callbacks=[lgb.early_stopping(stopping_rounds=50),
                                     lgb.log_evaluation(10)])

        # 验证集预测概率
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

        # 将每一折的预测结果填入到相应的索引位置
        final_predictions[valid_idx] = y_pred

    # 计算整体的KS分数
    overall_score = ks_stat(y, final_predictions)

    # 返回负的KS分数作为最小化目标
    return {'loss': -overall_score, 'status': STATUS_OK}


# 超参数空间
param_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),  # 学习率
    'num_leaves': hp.choice('num_leaves', np.arange(20, 150, dtype=int)),  # 叶子数
    'max_depth': hp.choice('max_depth', np.arange(3, 12, dtype=int)),  # 树的最大深度
    'min_child_weight': hp.uniform('min_child_weight', 0.001, 10),  # 子叶节点的最小权重
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),  # 样本列采样率
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 20),
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # 样本采样率
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),  # L1正则化
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)  # L2正则化
}


# 贝叶斯优化调参函数
def bayesian_optimize_lgbm(X, y, param_space, max_evals=50):
    trials = Trials()

    # 使用fmin函数进行贝叶斯优化
    best_params = fmin(
        fn=lambda params: objective(params, X, y),  # 优化目标
        space=param_space,  # 参数空间
        algo=tpe.suggest,  # 使用TPE算法
        max_evals=max_evals,  # 最大评估次数
        trials=trials  # 记录每次评估的结果
    )

    return best_params, trials

In [24]:
# 读取数据
df_target = loader.to_concat_df('TARGET')
df_flat = pd.read_csv(f'{dir_preprocess}/df_flat.csv')
df_flat = df_flat.merge(df_target, left_on=['CUST_NO', 'is_train'], right_on=['CUST_NO', 'is_train'], how='inner')
df_flat.drop(columns=['DATA_DAT', 'CARD_NO', 'CUST_NO'], inplace=True)

X = df_flat[df_flat['is_train'] == 1]
y = X.pop("FLAG")  # 目标标签列

# 执行贝叶斯优化
best_params, trials = bayesian_optimize_lgbm(X, y, param_space, max_evals=10)

print("Best parameters found:", best_params)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.132047	training's ks: 0.823144	valid_1's binary_logloss: 0.18094	valid_1's ks: 0.519395
[20]	training's binary_logloss: 0.118563	training's ks: 0.870877	valid_1's binary_logloss: 0.182574	valid_1's ks: 0.535411
  0%|          | 0/10 [00:01<?, ?trial/s, best loss=?]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.132221	training's ks: 0.828503	valid_1's binary_logloss: 0.17836	valid_1's ks: 0.5334
[20]	training's binary_logloss: 0.116955	training's ks: 0.875865	valid_1's binary_logloss: 0.179604	valid_1's ks: 0.529141
  0%|          | 0/10 [00:03<?, ?trial/s, best loss=?]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.132483	training's ks: 0.830632	valid_1's binary_logloss: 0.185361	valid_1's ks: 0.517343
[20]	training's binary_logloss: 0.119615	training's ks: 0.87506	valid_1's binary_logloss: 0.188621	valid_1's ks: 0.522084
  0%|          | 0/10 [00:05<?, ?trial/s, best loss=?]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.131408	training's ks: 0.843748	valid_1's binary_logloss: 0.177092	valid_1's ks: 0.515083
[20]	training's binary_logloss: 0.115308	training's ks: 0.891091	valid_1's binary_logloss: 0.17876	valid_1's ks: 0.517065
  0%|          | 0/10 [00:07<?, ?trial/s, best loss=?]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.132643	training's ks: 0.825433	valid_1's binary_logloss: 0.182057	valid_1's ks: 0.518585
[20]	training's binary_logloss: 0.115517	training's ks: 0.878888	valid_1's binary_logloss: 0.182167	valid_1's ks: 0.516354
 10%|█         | 1/10 [00:09<01:24,  9.41s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.214487	training's ks: 0.584879	valid_1's binary_logloss: 0.219901	valid_1's ks: 0.551151
[20]	training's binary_logloss: 0.236247	training's ks: 0.6077	valid_1's binary_logloss: 0.244612	valid_1's ks: 0.558739
 10%|█         | 1/10 [00:10<01:24,  9.41s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.213774	training's ks: 0.588587	valid_1's binary_logloss: 0.21841	valid_1's ks: 0.536705
[20]	training's binary_logloss: 0.235259	training's ks: 0.608877	valid_1's binary_logloss: 0.242082	valid_1's ks: 0.549187
 10%|█         | 1/10 [00:11<01:24,  9.41s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.213713	training's ks: 0.593431	valid_1's binary_logloss: 0.228529	valid_1's ks: 0.513955
[20]	training's binary_logloss: 0.235532	training's ks: 0.615408	valid_1's binary_logloss: 0.255646	valid_1's ks: 0.524801
 10%|█         | 1/10 [00:12<01:24,  9.41s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.214983	training's ks: 0.586756	valid_1's binary_logloss: 0.219284	valid_1's ks: 0.556798
[20]	training's binary_logloss: 0.236366	training's ks: 0.616032	valid_1's binary_logloss: 0.242441	valid_1's ks: 0.564197
 10%|█         | 1/10 [00:13<01:24,  9.41s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.213975	training's ks: 0.59436	valid_1's binary_logloss: 0.224471	valid_1's ks: 0.523494
[20]	training's binary_logloss: 0.235453	training's ks: 0.615633	valid_1's binary_logloss: 0.24935	valid_1's ks: 0.53366
 20%|██        | 2/10 [00:14<00:54,  6.82s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.179666	training's ks: 0.745319	valid_1's binary_logloss: 0.202891	valid_1's ks: 0.537794
[20]	training's binary_logloss: 0.188817	training's ks: 0.807124	valid_1's binary_logloss: 0.222868	valid_1's ks: 0.5232
 20%|██        | 2/10 [00:15<00:54,  6.82s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.1795	training's ks: 0.753049	valid_1's binary_logloss: 0.201718	valid_1's ks: 0.530089
[20]	training's binary_logloss: 0.189983	training's ks: 0.798358	valid_1's binary_logloss: 0.22231	valid_1's ks: 0.5414
 20%|██        | 2/10 [00:17<00:54,  6.82s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.179907	training's ks: 0.742088	valid_1's binary_logloss: 0.208456	valid_1's ks: 0.52553
[20]	training's binary_logloss: 0.190993	training's ks: 0.790324	valid_1's binary_logloss: 0.233555	valid_1's ks: 0.514419
 20%|██        | 2/10 [00:19<00:54,  6.82s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.177573	training's ks: 0.761422	valid_1's binary_logloss: 0.199219	valid_1's ks: 0.530947
[20]	training's binary_logloss: 0.187181	training's ks: 0.811721	valid_1's binary_logloss: 0.217727	valid_1's ks: 0.543281
 20%|██        | 2/10 [00:20<00:54,  6.82s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.177582	training's ks: 0.759198	valid_1's binary_logloss: 0.20436	valid_1's ks: 0.508658
[20]	training's binary_logloss: 0.188182	training's ks: 0.804734	valid_1's binary_logloss: 0.225835	valid_1's ks: 0.51336
 30%|███       | 3/10 [00:22<00:52,  7.44s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.145735	training's ks: 0.630688	valid_1's binary_logloss: 0.161469	valid_1's ks: 0.557245
[20]	training's binary_logloss: 0.13431	training's ks: 0.668243	valid_1's binary_logloss: 0.15909	valid_1's ks: 0.567803
[30]	training's binary_logloss: 0.128396	training's ks: 0.695817	valid_1's binary_logloss: 0.159546	valid_1's ks: 0.564373
[40]	training's binary_logloss: 0.123459	training's ks: 0.718915	valid_1's binary_logloss: 0.160369	valid_1's ks: 0.561268
 30%|███       | 3/10 [00:24<00:52,  7.44s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.146759	training's ks: 0.633344	valid_1's binary_logloss: 0.162049	valid_1's ks: 0.551392
[20]	training's binary_logloss: 0.134101	training's ks: 0.67123	valid_1's binary_logloss: 0.159306	valid_1's ks: 0.565038
[30]	training's binary_logloss: 0.129273	training's ks: 0.693778	valid_1's binary_logloss: 0.159116	valid_1's ks: 0.561097
[40]	training's binary_logloss: 0.123991	training's ks: 0.722494	valid_1's binary_logloss: 0.15977	valid_1's ks: 0.557533
 30%|███       | 3/10 [00:25<00:52,  7.44s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.145675	training's ks: 0.638421	valid_1's binary_logloss: 0.166092	valid_1's ks: 0.52823
[20]	training's binary_logloss: 0.134454	training's ks: 0.676123	valid_1's binary_logloss: 0.16398	valid_1's ks: 0.539552
[30]	training's binary_logloss: 0.128136	training's ks: 0.703323	valid_1's binary_logloss: 0.163908	valid_1's ks: 0.540069
[40]	training's binary_logloss: 0.122436	training's ks: 0.734254	valid_1's binary_logloss: 0.164075	valid_1's ks: 0.539428
 30%|███       | 3/10 [00:27<00:52,  7.44s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.146999	training's ks: 0.639131	valid_1's binary_logloss: 0.161781	valid_1's ks: 0.534029
[20]	training's binary_logloss: 0.134896	training's ks: 0.680393	valid_1's binary_logloss: 0.159936	valid_1's ks: 0.541416
[30]	training's binary_logloss: 0.12814	training's ks: 0.711198	valid_1's binary_logloss: 0.159884	valid_1's ks: 0.547957
[40]	training's binary_logloss: 0.122375	training's ks: 0.734756	valid_1's binary_logloss: 0.160984	valid_1's ks: 0.549657
 30%|███       | 3/10 [00:29<00:52,  7.44s/trial, best loss: -0.5052191623951107]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                     
[10]	training's binary_logloss: 0.145421	training's ks: 0.640798	valid_1's binary_logloss: 0.164646	valid_1's ks: 0.527926
[20]	training's binary_logloss: 0.134623	training's ks: 0.67468	valid_1's binary_logloss: 0.161951	valid_1's ks: 0.537943
[30]	training's binary_logloss: 0.12688	training's ks: 0.710286	valid_1's binary_logloss: 0.161947	valid_1's ks: 0.545312
[40]	training's binary_logloss: 0.122603	training's ks: 0.725667	valid_1's binary_logloss: 0.161729	valid_1's ks: 0.548307
 40%|████      | 4/10 [00:31<00:47,  7.88s/trial, best loss: -0.542467198507184] 

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.166138	training's ks: 0.81367	valid_1's binary_logloss: 0.202969	valid_1's ks: 0.544781
[20]	training's binary_logloss: 0.159156	training's ks: 0.87561	valid_1's binary_logloss: 0.212896	valid_1's ks: 0.54234
 40%|████      | 4/10 [00:32<00:47,  7.88s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.168953	training's ks: 0.811478	valid_1's binary_logloss: 0.203667	valid_1's ks: 0.537082
[20]	training's binary_logloss: 0.166984	training's ks: 0.859887	valid_1's binary_logloss: 0.214325	valid_1's ks: 0.548263
 40%|████      | 4/10 [00:34<00:47,  7.88s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.169376	training's ks: 0.801221	valid_1's binary_logloss: 0.213521	valid_1's ks: 0.508844
[20]	training's binary_logloss: 0.165729	training's ks: 0.848827	valid_1's binary_logloss: 0.227123	valid_1's ks: 0.516689
 40%|████      | 4/10 [00:36<00:47,  7.88s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.1683	training's ks: 0.8094	valid_1's binary_logloss: 0.205899	valid_1's ks: 0.506638
[20]	training's binary_logloss: 0.163045	training's ks: 0.861196	valid_1's binary_logloss: 0.215556	valid_1's ks: 0.508997
 40%|████      | 4/10 [00:37<00:47,  7.88s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.168506	training's ks: 0.798316	valid_1's binary_logloss: 0.209434	valid_1's ks: 0.509315
[20]	training's binary_logloss: 0.16108	training's ks: 0.862474	valid_1's binary_logloss: 0.218638	valid_1's ks: 0.518
 50%|█████     | 5/10 [00:39<00:40,  8.16s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.197399	training's ks: 0.687967	valid_1's binary_logloss: 0.213139	valid_1's ks: 0.546858
[20]	training's binary_logloss: 0.219092	training's ks: 0.731082	valid_1's binary_logloss: 0.24178	valid_1's ks: 0.548781
 50%|█████     | 5/10 [00:41<00:40,  8.16s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.195379	training's ks: 0.701413	valid_1's binary_logloss: 0.212213	valid_1's ks: 0.514367
[20]	training's binary_logloss: 0.216103	training's ks: 0.731382	valid_1's binary_logloss: 0.239227	valid_1's ks: 0.525929
 50%|█████     | 5/10 [00:42<00:40,  8.16s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.196292	training's ks: 0.685995	valid_1's binary_logloss: 0.219764	valid_1's ks: 0.519642
[20]	training's binary_logloss: 0.218967	training's ks: 0.713187	valid_1's binary_logloss: 0.252203	valid_1's ks: 0.522525
 50%|█████     | 5/10 [00:44<00:40,  8.16s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.197402	training's ks: 0.696626	valid_1's binary_logloss: 0.212788	valid_1's ks: 0.534346
[20]	training's binary_logloss: 0.218627	training's ks: 0.726613	valid_1's binary_logloss: 0.240173	valid_1's ks: 0.542634
 50%|█████     | 5/10 [00:45<00:40,  8.16s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.195856	training's ks: 0.698309	valid_1's binary_logloss: 0.216839	valid_1's ks: 0.512175
[20]	training's binary_logloss: 0.216064	training's ks: 0.739069	valid_1's binary_logloss: 0.244895	valid_1's ks: 0.51806
 60%|██████    | 6/10 [00:47<00:31,  7.94s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.200225	training's ks: 0.694173	valid_1's binary_logloss: 0.215864	valid_1's ks: 0.547375
[20]	training's binary_logloss: 0.225007	training's ks: 0.736295	valid_1's binary_logloss: 0.246967	valid_1's ks: 0.54924
 60%|██████    | 6/10 [00:48<00:31,  7.94s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.200602	training's ks: 0.697095	valid_1's binary_logloss: 0.214688	valid_1's ks: 0.535106
[20]	training's binary_logloss: 0.224942	training's ks: 0.739759	valid_1's binary_logloss: 0.245224	valid_1's ks: 0.545287
 60%|██████    | 6/10 [00:50<00:31,  7.94s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.200731	training's ks: 0.696456	valid_1's binary_logloss: 0.222905	valid_1's ks: 0.505515
[20]	training's binary_logloss: 0.226129	training's ks: 0.737279	valid_1's binary_logloss: 0.258569	valid_1's ks: 0.518196
 60%|██████    | 6/10 [00:51<00:31,  7.94s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.200395	training's ks: 0.705183	valid_1's binary_logloss: 0.213869	valid_1's ks: 0.539488
[20]	training's binary_logloss: 0.225425	training's ks: 0.742098	valid_1's binary_logloss: 0.243482	valid_1's ks: 0.56208
 60%|██████    | 6/10 [00:52<00:31,  7.94s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.199856	training's ks: 0.696936	valid_1's binary_logloss: 0.218858	valid_1's ks: 0.514091
[20]	training's binary_logloss: 0.223897	training's ks: 0.745992	valid_1's binary_logloss: 0.250602	valid_1's ks: 0.533917
 70%|███████   | 7/10 [00:54<00:23,  7.67s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.173179	training's ks: 0.724316	valid_1's binary_logloss: 0.185529	valid_1's ks: 0.531442
[20]	training's binary_logloss: 0.160345	training's ks: 0.7684	valid_1's binary_logloss: 0.179973	valid_1's ks: 0.5404
[30]	training's binary_logloss: 0.153506	training's ks: 0.798752	valid_1's binary_logloss: 0.17893	valid_1's ks: 0.539864
[40]	training's binary_logloss: 0.14911	training's ks: 0.821063	valid_1's binary_logloss: 0.179323	valid_1's ks: 0.544023
[50]	training's binary_logloss: 0.146134	training's ks: 0.838503	valid_1's binary_logloss: 0.18032	valid_1's ks: 0.547652
 70%|███████   | 7/10 [00:58<00:23,  7.67s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.173289	training's ks: 0.732214	valid_1's binary_logloss: 0.18537	valid_1's ks: 0.529088
[20]	training's binary_logloss: 0.160554	training's ks: 0.765476	valid_1's binary_logloss: 0.179931	valid_1's ks: 0.54384
[30]	training's binary_logloss: 0.153775	training's ks: 0.791911	valid_1's binary_logloss: 0.178557	valid_1's ks: 0.549639
[40]	training's binary_logloss: 0.149482	training's ks: 0.813081	valid_1's binary_logloss: 0.179049	valid_1's ks: 0.550139
 70%|███████   | 7/10 [01:01<00:23,  7.67s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.172912	training's ks: 0.73351	valid_1's binary_logloss: 0.18625	valid_1's ks: 0.493803
[20]	training's binary_logloss: 0.159902	training's ks: 0.773911	valid_1's binary_logloss: 0.181797	valid_1's ks: 0.52563
[30]	training's binary_logloss: 0.153124	training's ks: 0.805394	valid_1's binary_logloss: 0.181906	valid_1's ks: 0.51906
[40]	training's binary_logloss: 0.148937	training's ks: 0.81874	valid_1's binary_logloss: 0.183204	valid_1's ks: 0.52466
 70%|███████   | 7/10 [01:05<00:23,  7.67s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.173115	training's ks: 0.74297	valid_1's binary_logloss: 0.185065	valid_1's ks: 0.513378
[20]	training's binary_logloss: 0.160661	training's ks: 0.779867	valid_1's binary_logloss: 0.179688	valid_1's ks: 0.519583
[30]	training's binary_logloss: 0.154028	training's ks: 0.807658	valid_1's binary_logloss: 0.178534	valid_1's ks: 0.523595
[40]	training's binary_logloss: 0.150159	training's ks: 0.825411	valid_1's binary_logloss: 0.178508	valid_1's ks: 0.531812
[50]	training's binary_logloss: 0.147486	training's ks: 0.837393	valid_1's binary_logloss: 0.179573	valid_1's ks: 0.531829
 70%|███████   | 7/10 [01:10<00:23,  7.67s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.172718	training's ks: 0.730949	valid_1's binary_logloss: 0.186319	valid_1's ks: 0.514285
[20]	training's binary_logloss: 0.159798	training's ks: 0.77423	valid_1's binary_logloss: 0.18133	valid_1's ks: 0.521653
[30]	training's binary_logloss: 0.153063	training's ks: 0.803397	valid_1's binary_logloss: 0.180586	valid_1's ks: 0.528153
[40]	training's binary_logloss: 0.148794	training's ks: 0.819422	valid_1's binary_logloss: 0.181428	valid_1's ks: 0.527373
 80%|████████  | 8/10 [01:14<00:23, 11.64s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.174111	training's ks: 0.824853	valid_1's binary_logloss: 0.219822	valid_1's ks: 0.528871
[20]	training's binary_logloss: 0.155647	training's ks: 0.874313	valid_1's binary_logloss: 0.218351	valid_1's ks: 0.528818
 80%|████████  | 8/10 [01:16<00:23, 11.64s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.173087	training's ks: 0.821479	valid_1's binary_logloss: 0.220145	valid_1's ks: 0.501784
[20]	training's binary_logloss: 0.155666	training's ks: 0.879054	valid_1's binary_logloss: 0.219198	valid_1's ks: 0.515095
 80%|████████  | 8/10 [01:18<00:23, 11.64s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.169749	training's ks: 0.835833	valid_1's binary_logloss: 0.225045	valid_1's ks: 0.51045
[20]	training's binary_logloss: 0.155883	training's ks: 0.867451	valid_1's binary_logloss: 0.228367	valid_1's ks: 0.510167
 80%|████████  | 8/10 [01:20<00:23, 11.64s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.17056	training's ks: 0.825311	valid_1's binary_logloss: 0.215565	valid_1's ks: 0.516584
[20]	training's binary_logloss: 0.150152	training's ks: 0.888252	valid_1's binary_logloss: 0.213688	valid_1's ks: 0.50942
 80%|████████  | 8/10 [01:22<00:23, 11.64s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.170036	training's ks: 0.830834	valid_1's binary_logloss: 0.219166	valid_1's ks: 0.517535
[20]	training's binary_logloss: 0.152126	training's ks: 0.887679	valid_1's binary_logloss: 0.214403	valid_1's ks: 0.541163
 90%|█████████ | 9/10 [01:24<00:11, 11.13s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.23863	training's ks: 0.632269	valid_1's binary_logloss: 0.248674	valid_1's ks: 0.563609
[20]	training's binary_logloss: 0.266635	training's ks: 0.681848	valid_1's binary_logloss: 0.282418	valid_1's ks: 0.56228
 90%|█████████ | 9/10 [01:25<00:11, 11.13s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.238384	training's ks: 0.642321	valid_1's binary_logloss: 0.24722	valid_1's ks: 0.531948
[20]	training's binary_logloss: 0.265809	training's ks: 0.686041	valid_1's binary_logloss: 0.280148	valid_1's ks: 0.54327
 90%|█████████ | 9/10 [01:27<00:11, 11.13s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.238627	training's ks: 0.634176	valid_1's binary_logloss: 0.257827	valid_1's ks: 0.518107
[20]	training's binary_logloss: 0.266602	training's ks: 0.678519	valid_1's binary_logloss: 0.295284	valid_1's ks: 0.514949
 90%|█████████ | 9/10 [01:28<00:11, 11.13s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.240618	training's ks: 0.638914	valid_1's binary_logloss: 0.248452	valid_1's ks: 0.534923
[20]	training's binary_logloss: 0.26789	training's ks: 0.682527	valid_1's binary_logloss: 0.279088	valid_1's ks: 0.546545
 90%|█████████ | 9/10 [01:29<00:11, 11.13s/trial, best loss: -0.542467198507184]

Found `n_estimators` in params. Will use it instead of argument



Training until validation scores don't improve for 50 rounds                    
[10]	training's binary_logloss: 0.238619	training's ks: 0.63739	valid_1's binary_logloss: 0.254265	valid_1's ks: 0.5236
[20]	training's binary_logloss: 0.266104	training's ks: 0.683943	valid_1's binary_logloss: 0.288466	valid_1's ks: 0.540821
100%|██████████| 10/10 [01:31<00:00,  9.11s/trial, best loss: -0.542467198507184]
Best parameters found: {'colsample_bytree': 0.6983157656158572, 'learning_rate': 0.1705732511631721, 'max_depth': 4, 'min_child_weight': 9.775174703305787, 'num_leaves': 23, 'reg_alpha': 0.6014275790836761, 'reg_lambda': 0.30676387849859477, 'scale_pos_weight': 1.0099553852611183, 'subsample': 0.8925238455520927}


In [25]:
# 5折交叉验证
def lgb_5_fold(X, y, best_params, n_folds=5):
    best_params.update({
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'verbose': -1,
        'n_estimators': 10000,  # 和贝叶斯优化期间保持一致
        'early_stopping_round': 20
    })

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    fold_scores = []
    final_predictions = np.zeros(len(X))  # 用于存储每一折的预测结果

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # 训练模型
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        model = lgb.train(best_params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          feval=lgb_ks_eval,
                          callbacks=[lgb.early_stopping(stopping_rounds=50),
                                     lgb.log_evaluation(10)])

        # 保存模型
        model_path = f"{dir_model}/lgbm_fold_{fold + 1}.bin"
        model.save_model(model_path)
        print(f"Model for fold {fold + 1} saved at {model_path}")

        # 验证集预测
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

        # 计算KS
        score = ks_stat(y_valid, y_pred)
        fold_scores.append(score)

        # 将每一折的预测结果保存到对应的索引位置
        final_predictions[valid_idx] = y_pred

        print(f"Fold {fold + 1} - Score: {score:.4f}")

    # 最终综合预测结果的评分
    overall_score = ks_stat(y, final_predictions)

    print(f"\nOverall Score: {overall_score:.4f}")

    return overall_score

In [26]:
# 执行5折交叉验证
best_score = lgb_5_fold(X, y, best_params)
print(f"Best score: {best_score}")


Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.157059	training's ks: 0.576152	valid_1's binary_logloss: 0.163124	valid_1's ks: 0.561215
[20]	training's binary_logloss: 0.149618	training's ks: 0.595574	valid_1's binary_logloss: 0.15999	valid_1's ks: 0.553792
[30]	training's binary_logloss: 0.145429	training's ks: 0.606795	valid_1's binary_logloss: 0.159353	valid_1's ks: 0.557227
Model for fold 1 saved at /Users/z/Data/Contest/2024/sh/dummy/model/lgbm_fold_1.bin
Fold 1 - Score: 0.5658


Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.156688	training's ks: 0.577266	valid_1's binary_logloss: 0.163487	valid_1's ks: 0.543723
[20]	training's binary_logloss: 0.14903	training's ks: 0.603021	valid_1's binary_logloss: 0.159818	valid_1's ks: 0.548952
[30]	training's binary_logloss: 0.144621	training's ks: 0.613624	valid_1's binary_logloss: 0.15864	valid_1's ks: 0.553868
[40]	training's binary_logloss: 0.140997	training's ks: 0.6388	valid_1's binary_logloss: 0.158075	valid_1's ks: 0.555845
[50]	training's binary_logloss: 0.138129	training's ks: 0.654463	valid_1's binary_logloss: 0.158351	valid_1's ks: 0.551328
Model for fold 2 saved at /Users/z/Data/Contest/2024/sh/dummy/model/lgbm_fold_2.bin
Fold 2 - Score: 0.5568


Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.156697	training's ks: 0.583152	valid_1's binary_logloss: 0.166882	valid_1's ks: 0.517907
[20]	training's binary_logloss: 0.149197	training's ks: 0.601976	valid_1's binary_logloss: 0.162898	valid_1's ks: 0.535064
[30]	training's binary_logloss: 0.145573	training's ks: 0.61526	valid_1's binary_logloss: 0.161948	valid_1's ks: 0.542605
[40]	training's binary_logloss: 0.142556	training's ks: 0.628371	valid_1's binary_logloss: 0.16142	valid_1's ks: 0.540941
[50]	training's binary_logloss: 0.139811	training's ks: 0.642123	valid_1's binary_logloss: 0.161107	valid_1's ks: 0.54724
[60]	training's binary_logloss: 0.136972	training's ks: 0.657321	valid_1's binary_logloss: 0.161233	valid_1's ks: 0.547164
[70]	training's binary_logloss: 0.134708	training's ks: 0.661718	valid_1's binary_logloss: 0.161718	valid_1's ks: 0.540082
Model for fold 3 saved at /Users/z/Data/Contest/2024/sh/dummy/model/lgbm_fold_3.b

Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.157008	training's ks: 0.574982	valid_1's binary_logloss: 0.163026	valid_1's ks: 0.543487
[20]	training's binary_logloss: 0.149493	training's ks: 0.597475	valid_1's binary_logloss: 0.159274	valid_1's ks: 0.552869
[30]	training's binary_logloss: 0.145258	training's ks: 0.609912	valid_1's binary_logloss: 0.158365	valid_1's ks: 0.557344
[40]	training's binary_logloss: 0.141428	training's ks: 0.628169	valid_1's binary_logloss: 0.158344	valid_1's ks: 0.562103
[50]	training's binary_logloss: 0.137799	training's ks: 0.639677	valid_1's binary_logloss: 0.15854	valid_1's ks: 0.562391
[60]	training's binary_logloss: 0.135215	training's ks: 0.654129	valid_1's binary_logloss: 0.158815	valid_1's ks: 0.556127
Model for fold 4 saved at /Users/z/Data/Contest/2024/sh/dummy/model/lgbm_fold_4.bin
Fold 4 - Score: 0.5665


Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 50 rounds
[10]	training's binary_logloss: 0.156165	training's ks: 0.567873	valid_1's binary_logloss: 0.166483	valid_1's ks: 0.531536
[20]	training's binary_logloss: 0.148325	training's ks: 0.595583	valid_1's binary_logloss: 0.162978	valid_1's ks: 0.530982
[30]	training's binary_logloss: 0.143799	training's ks: 0.613765	valid_1's binary_logloss: 0.162249	valid_1's ks: 0.535129
[40]	training's binary_logloss: 0.141151	training's ks: 0.621926	valid_1's binary_logloss: 0.162119	valid_1's ks: 0.540969
[50]	training's binary_logloss: 0.138316	training's ks: 0.634735	valid_1's binary_logloss: 0.162403	valid_1's ks: 0.537181
Model for fold 5 saved at /Users/z/Data/Contest/2024/sh/dummy/model/lgbm_fold_5.bin
Fold 5 - Score: 0.5428

Overall Score: 0.5485
Best score: 0.5485360295890676


In [22]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

def find_optimal_seed(X, y, param_space, n_folds=5, seed_list=[0, 42, 100, 2023]):
    best_seed = None
    lowest_std = float('inf')
    seed_scores = {}

    for seed in seed_list:
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        fold_scores = []

        for train_idx, valid_idx in skf.split(X, y):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            # 使用与之前一致的参数和模型训练方法
            model = lgb.LGBMClassifier(**param_space)  # 使用参数构建模型
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
            
            y_pred = model.predict_proba(X_valid)[:, 1]
            ks = ks_stat(y_valid, y_pred)
            fold_scores.append(ks)

        # 计算该种子下的折间KS值的标准差
        std_ks = np.std(fold_scores)
        seed_scores[seed] = (np.mean(fold_scores), std_ks)

        if std_ks < lowest_std:
            best_seed = seed
            lowest_std = std_ks

    print(f"Best seed found: {best_seed} with lowest KS std: {lowest_std}")
    return best_seed, seed_scores


optimal_seed, seed_scores = find_optimal_seed(X, y, best_params, n_folds=5, seed_list=[0, 42, 100, 2023])

print(optimal_seed)
print(seed_scores)

'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
'early_stopping_rounds' argument is depr

Best seed found: 0 with lowest KS std: 0.009036162651410237
0
{0: (0.5519935649537354, 0.009036162651410237), 42: (0.5501777084193054, 0.029194183607442168), 100: (0.5534339207343374, 0.021856339442777126), 2023: (0.5484132032355762, 0.012979645226222662)}


In [27]:
# 测试集预测并生成结果文件
def predict_test_set(test_data, n_folds=5, output_file='upload.csv'):
    test_predictions = np.zeros(len(test_data))

    # 依次加载每一折的模型进行预测
    for fold in range(n_folds):
        model_path = f"{dir_model}/lgbm_dart_fold_{fold + 1}.bin"
        model = lgb.Booster(model_file=model_path)

        # 对测试集进行预测，并将每一折的预测加总
        test_predictions += model.predict(test_data)

    # 平均每一折的预测
    test_predictions /= n_folds

    # 生成预测文件
    submission = pd.DataFrame({'CUST_NO': test_data.index, 'PRED': test_predictions})
    submission.to_csv(output_file, index=False)
    print(f"Test predictions saved to {output_file}")