In [3]:
from main.utils.analysis import *
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [4]:
params = {'data_pth':'../../1/mj2非银版/data/modeling_data.csv',
    'date_colName': 'apply_date',
    'y_colName': 'credit_target',
    'org_colName':'org',
    'data_encode':'utf-8',
    'key_colNames' : ['mobile_org', 'apply_date', 'org']}
data = get_dataset(**params)

"原始数据有741764条, 根据['mobile_org', 'apply_date', 'org']去重且只保留标签列[0,1]的数据"

'去重后数据有741764条'

'credit_target, org被重命名为new_target, new_org; apply_date被格式化为new_date, new_date_ym两列'

In [18]:
def train_epoch_(org, tr_orgidx, val_orgidx, tr_idxs, val_idxs, X_tr, X_val, y_tr, y_val, w_tr, param):
    tr_idx = tr_orgidx.get(org)
    val_idx = val_orgidx.get(org)
    X_tr_, y_tr_, w_tr_ = X_tr.loc[list(tr_idxs-set(tr_idx)), ], y_tr.loc[list(tr_idxs-set(tr_idx)), ], w_tr.loc[list(tr_idxs-set(tr_idx)), ]
    X_val_, y_val_ = X_val.loc[list(val_idxs-set(val_idx)), ], y_val.loc[list(val_idxs-set(val_idx)), ]
    X_oos, y_oos = pd.concat([X_tr.loc[tr_idx, ], X_val.loc[val_idx, ]], axis=0) , pd.concat([y_tr.loc[tr_idx, ], y_val.loc[val_idx, ]], axis=0)
    callbacks = None
    if 'stopping_rounds' in param.keys():
        param.update({'num_iterations': 300})
        callbacks = [lgb.early_stopping(stopping_rounds=param.get('stopping_rounds'))]
    model = lgb.train(
                      param,
                      verbose_eval=0, 
                      train_set = lgb.Dataset(X_tr_, label=y_tr_, weight=w_tr_), 
                      valid_sets = [lgb.Dataset(X_tr_, label=y_tr_), lgb.Dataset(X_val_, label=y_val_)],
                      valid_names = ['train', 'val'],
                      callbacks = callbacks
                     )
    ks_tr, ks_val, ks_oos = _get_ks(model, X_tr_, y_tr_), _get_ks(model, X_val, y_val), _get_ks(model, X_oos, y_oos)
    record = pd.DataFrame({'oos_org':org, 'train_ks':ks_tr, 'val_ks':ks_val, 'oos_ks':ks_oos}, index=['0'])
    return record

def train_epoch(X_tr, X_val, y_tr, y_val, w_tr, tr_orgidx, val_orgidx, param, fobj):
    if fobj is not None:
        param.update({'objective': fobj})
    tr_idxs, val_idxs = set(X_tr.index), set(X_val.index)
    results = pd.DataFrame()
    tasks = [(org, tr_orgidx, val_orgidx, tr_idxs, val_idxs, X_tr, X_val, y_tr, y_val, w_tr, param) for org in tr_orgidx.keys()]
    with Pool(5) as pool:
        records = pool.starmap(train_epoch_, tasks)
    for record in records:
        results = pd.concat([results, record], axis=0)
    return results
    

def gridsearch_params(params, data, max_interations, max_gap, min_ks):
    feas = [v for v in data.columns if data[v].dtype!='O' and v!='new_target']
    tr_orgidx, val_orgidx, val_idx, tr_idx = {}, {}, [], []
    splitter = StratifiedShuffleSplit(n_splits=1, random_state=42, train_size=0.8)
    for org in data.new_org.unique():
        tmp_data = data[data.new_org==org].copy()
        for idx_tr, idx_val in splitter.split(tmp_data[feas], tmp_data['new_target']):
            tr_orgidx[org] = list(idx_tr)
            val_orgidx[org] = list(idx_val)
            val_idx += list(idx_val)
            tr_idx += list(idx_tr)
    data_tr, data_val = data.loc[tr_idx, ], data.loc[val_idx, ]
    X_tr, X_val, y_tr, y_val = data_tr[feas], data_val[feas], data_tr['new_target'], data_val['new_target']
    w_tr = pd.Series(np.ones(X_tr.shape[0]))
    
    good_params = pd.DataFrame()
    sampled_params = list(ParameterSampler(
        params, 
        n_iter=max_interations, 
        random_state=42
    ))
    begin_time_, begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), time.time()
    display(f"开始执行时间：{begin_time_}")
    for i, param in enumerate(tqdm.tqdm(sampled_params)):
        records = train_epoch(X_tr, X_val, y_tr, y_val, w_tr, tr_orgidx, val_orgidx, param, None)
        mean_tr_ks = np.mean(records['train_ks'])
        mean_val_ks = np.mean(records['val_ks'])
        mean_oos_ks = (np.sum(records['oos_ks'])-np.min(records['oos_ks'])-np.max(records['oos_ks']))*1.0 / (records.shape[0]-2)
        if np.allclose(records['train_ks'], records['val_ks'], atol=max_gap) and mean_val_ks>=min_ks and mean_oos_ks>=min_ks:
            good_params = pd.concat([good_params, pd.DataFrame({'param': [param], 'mean_tr_ks':mean_tr_ks,
                                                                'mean_val_ks':mean_val_ks, 'mean_oos_ks':mean_oos_ks}, index=['0'])], axis=0)
            display(good_params)
            good_params_ = good_params[good_params.mean_val_ks==np.max(good_params.mean_val_ks)]
            display(f"当前最优参数下train平均ks是{np.round(good_params_['mean_tr_ks'].values, 3)}, val平均ks是{np.round(good_params_['mean_val_ks'].values, 3)}, oos平均ks是{np.round(good_params_['mean_oos_ks'].values, 3)}")
        current_time = time.time()
        display(f"平均每组参数训练耗时：{np.round((current_time-begin_time)*1.0 / ((i+1)*60), 2)}分")
    return good_params

In [19]:
params = {'num_threads': [2], 'num_iterations': np.arange(80, 100, 3),'learning_rate':[0.05],
        'colsample_bytree': [0.6],'max_depth': [4, 5],'max_bin': np.arange(50, 100, 10),'min_child_weight': [25],
        'reg_alpha': [3],'reg_lambda': [1], 'objective':['binary'],
        'metric':['auc']
}
def top_5_lift(pred_, data):
    y = data.get_label()
    pred = 1 / (1 + np.exp(-pred_))
    lift5 = _get_lift(y, pred, 0.05)
    return '5%lift', lift5, True
def top_10_lift(pred_, data):
    y = data.get_label()
    pred = 1 / (1 + np.exp(-pred_))
    lift10 = _get_lift(y, pred, 0.1)
    return '10%lift', lift10, True 
def _get_lift(y, pred, k):
    n_top = int(len(y) * k)
    top_indices = pd.Series(pred).sort_values(ascending=False).head(n_top).index
    return y[top_indices].mean() / y.mean()
def _get_ks(model, X, y):
    pred = model.predict(X)
    ks = toad.metrics.KS(pred, y)
    return ks

In [20]:
good_params = gridsearch_params(params, data, 5, 0.2, 0.15)

'开始执行时间：2025-06-10 09:48:46'



  0%|          | 0/5 [00:00<?, ?it/s][A[A

'平均每组参数训练耗时：1.25分'



 20%|██        | 1/5 [01:15<05:00, 75.16s/it][A[A

Unnamed: 0,param,mean_tr_ks,mean_val_ks,mean_oos_ks
0,"{'reg_lambda': 1, 'reg_alpha': 3, 'objective':...",0.33366,0.278506,0.265365


'当前最优参数下train平均ks是[0.334], val平均ks是[0.279], oos平均ks是[0.265]'

'平均每组参数训练耗时：1.22分'



 40%|████      | 2/5 [02:26<03:42, 74.14s/it][A[A

'平均每组参数训练耗时：1.23分'



 60%|██████    | 3/5 [03:41<02:28, 74.21s/it][A[A

Unnamed: 0,param,mean_tr_ks,mean_val_ks,mean_oos_ks
0,"{'reg_lambda': 1, 'reg_alpha': 3, 'objective':...",0.33366,0.278506,0.265365
0,"{'reg_lambda': 1, 'reg_alpha': 3, 'objective':...",0.341875,0.281628,0.267177


'当前最优参数下train平均ks是[0.342], val平均ks是[0.282], oos平均ks是[0.267]'

'平均每组参数训练耗时：1.24分'



 80%|████████  | 4/5 [04:56<01:14, 74.64s/it][A[A

'平均每组参数训练耗时：1.24分'



100%|██████████| 5/5 [06:13<00:00, 75.09s/it][A[A

[A[A