In [3]:
from main.utils.analysis import *
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Keyword arguments forwarded unchanged to get_dataset().
params = dict(
    data_pth='../../1/mj2非银版/data/modeling_data.csv',
    date_colName='apply_date',
    y_colName='credit_target',
    org_colName='org',
    data_encode='utf-8',
    key_colNames=['mobile_org', 'apply_date', 'org'],
)
data = get_dataset(**params)

In [None]:
def _lift_metric(pred_, data, k, name):
    """Shared body of the top-k% lift evaluation metrics.

    Args:
        pred_: raw model scores, one per row; they are squashed with the
            logistic function before ranking.
        data: object exposing ``get_label()`` that returns the true labels.
        k: fraction of rows (ranked by score, highest first) to keep.
        name: metric name returned as the first tuple element.

    Returns:
        ``(name, lift, True)``; the trailing ``True`` is the third element of
        the tuple handed back to the caller (LightGBM reads it as
        "higher is better" when this is used as a custom metric).
    """
    y = data.get_label()
    pred = 1 / (1 + np.exp(-np.asarray(pred_, dtype=float)))
    return name, _get_lift(y, pred, k), True


def top_5_lift(pred_, data):
    """Lift of the top 5% highest-scored rows, reported as ``'5%lift'``."""
    return _lift_metric(pred_, data, 0.05, '5%lift')


def top_10_lift(pred_, data):
    """Lift of the top 10% highest-scored rows, reported as ``'10%lift'``."""
    return _lift_metric(pred_, data, 0.1, '10%lift')


def _get_lift(y, pred, k):
    """Return the positive rate among the top ``k`` fraction of rows divided
    by the overall positive rate.

    Args:
        y: true labels (array-like or pandas Series); matched to ``pred`` by
            position, never by index label.
        pred: scores, same length as ``y``; higher means ranked earlier.
        k: fraction of rows to keep. At least one row is always kept, so a
            small sample no longer yields the mean of an empty selection.

    Returns:
        The lift as a float, or ``nan`` when ``y`` is empty or has no
        positives (the ratio is undefined in both cases).

    Raises:
        ValueError: if ``y`` and ``pred`` differ in length.
    """
    # Rebuild both inputs on a fresh RangeIndex so that the indices taken from
    # the sorted scores are positions that are valid for the labels as well.
    labels = pd.Series(np.asarray(y)).astype(float)
    scores = pd.Series(np.asarray(pred))
    if len(labels) != len(scores):
        raise ValueError(
            'y and pred must have the same length, got %d and %d'
            % (len(labels), len(scores))
        )
    if len(labels) == 0:
        return float('nan')
    base_rate = labels.mean()
    if base_rate == 0:
        return float('nan')
    n_top = max(1, int(len(labels) * k))
    top_positions = scores.sort_values(ascending=False).head(n_top).index
    return labels.iloc[top_positions].mean() / base_rate
def _get_ks(model, X, y):
    """Compute the KS statistic of ``model``'s scores on ``X`` against ``y``.

    Args:
        model: fitted model. When it exposes ``predict_proba`` (scikit-learn
            style classifiers such as ``LGBMClassifier``) the positive-class
            column is scored; otherwise ``predict`` is used as before.
        X: feature matrix passed to the model.
        y: true labels passed to ``toad.metrics.KS`` as the target.

    Returns:
        Whatever ``toad.metrics.KS(score, target)`` returns.
    """
    if hasattr(model, 'predict_proba'):
        # A classifier's predict() yields hard class labels, which collapses
        # the score distribution to two points; KS needs continuous scores.
        proba = model.predict_proba(X)
        # A 1-D result is used as-is (nothing to slice).
        pred = proba[:, 1] if getattr(proba, 'ndim', 1) == 2 else proba
    else:
        pred = model.predict(X)
    return toad.metrics.KS(pred, y)

class Inference(object):
    '''
    Leave-one-organisation-out training and evaluation of a LightGBM classifier.

    init (keyword arguments, each defaulting to None when omitted):
        data:    DataFrame read by ``refit``; it must contain the ``new_org``
                 and ``new_target`` columns, every other non-object column is
                 used as a feature
        weight:  optional per-row sample weights, indexed like ``data``
        param:   LightGBM parameters for ``refit``; an optional
                 ``stopping_rounds`` entry switches early stopping on
        params, fobj, max_iterations, trails, tr_orgidx:
                 state read by ``objective`` / ``tpesearch_params``
    funcs:
        refit:            hold each organisation out once, report KS per hold-out
        objective:        hyperopt objective for one parameter set
        tpesearch_params: run the TPE search and return the trials as a DataFrame

    NOTE(review): ``objective`` calls ``self.train_epoch_``, which this class
    does not define; it has to come from a subclass or be attached to the
    instance before ``objective`` / ``tpesearch_params`` can run.
    '''

    def __init__(self, **kwargs):
        self.data = kwargs.get('data')
        self.weight = kwargs.get('weight')
        self.param = kwargs.get('param')
        self.model = None
        # objective() and tpesearch_params() read these attributes; they were
        # never assigned before, so both methods failed with AttributeError.
        self.params = kwargs.get('params')
        self.fobj = kwargs.get('fobj')
        self.max_iterations = kwargs.get('max_iterations')
        self.trails = kwargs.get('trails')
        self.tr_orgidx = kwargs.get('tr_orgidx')

    @staticmethod
    def _as_sklearn_metric(feval):
        '''Adapt ``feval(preds, dataset)`` to the ``func(y_true, y_pred)``
        signature that ``LGBMClassifier.fit`` uses for custom metrics.'''
        class _LabelView(object):
            # Minimal stand-in exposing the only method the metrics call.
            def __init__(self, label):
                self._label = label

            def get_label(self):
                return self._label

        def metric(y_true, y_pred):
            return feval(y_pred, _LabelView(y_true))

        metric.__name__ = getattr(feval, '__name__', 'metric')
        return metric

    # Each single organisation serves as OOS while the remaining organisations
    # form the training set, to inspect the OOS performance.
    def refit(self):
        '''
        Train one model per held-out organisation and collect KS values.

        Returns a DataFrame with one row per organisation and the columns
        ``oos_org``, ``train_ks``, ``val_ks`` and ``oos_ks``. ``val_ks`` is
        NaN here because refit has no separate validation fold; the column is
        kept so the schema matches the records consumed by ``objective``.
        ``self.model`` ends up holding the model of the last organisation.
        '''
        data = self.data
        feas = [v for v in data.columns if data[v].dtype != 'O' and v != 'new_target']

        # Work on a copy: popping from self.param dropped 'stopping_rounds'
        # after the first organisation, silently disabling early stopping for
        # all the others.
        param = dict(self.param or {})
        # Stay consistent with the early stopping used during the parameter
        # search, i.e. reuse the same parameters.
        use_early_stopping = 'stopping_rounds' in param
        early_stopping_rounds = param.pop('stopping_rounds', None)
        if use_early_stopping:
            param['num_iterations'] = 300

        eval_metrics = [self._as_sklearn_metric(top_5_lift),
                        self._as_sklearn_metric(top_10_lift)]
        # Older LightGBM releases may lack log_evaluation; skip it there.
        log_evaluation = getattr(lgb, 'log_evaluation', None)

        records = []
        for org in data.new_org.unique():
            data_tr = data[~data.new_org.isin([org])]
            data_oos = data[data.new_org == org]
            X_tr, y_tr = data_tr[feas], data_tr['new_target']
            X_oos, y_oos = data_oos[feas], data_oos['new_target']
            # w_tr was computed but never passed on in the original; using it
            # as the training sample weight is the presumed intent - confirm.
            w_tr = None if self.weight is None else self.weight.loc[data_tr.index]

            # Callbacks replace fit(early_stopping_rounds=..., verbose=-1),
            # keywords that LightGBM 4 no longer accepts. They are rebuilt for
            # every fit so no early-stopping state leaks between organisations.
            callbacks = []
            if log_evaluation is not None:
                callbacks.append(log_evaluation(period=0))
            if early_stopping_rounds:
                callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=False))

            self.model = lgb.LGBMClassifier(**param)
            # Only the OOS lift is monitored. The training set is left out of
            # eval_set because eval_sample_weight is None: adding it would
            # call for weighted evaluation, which conflicts with the OOS one.
            self.model.fit(
                X_tr, y_tr,
                sample_weight=w_tr,
                eval_set=[(X_oos, y_oos)],
                eval_metric=eval_metrics,
                callbacks=callbacks or None,
            )
            records.append({
                'oos_org': org,
                'train_ks': _get_ks(self.model, X_tr, y_tr),
                'val_ks': np.nan,
                'oos_ks': _get_ks(self.model, X_oos, y_oos),
            })
        return pd.DataFrame(records, columns=['oos_org', 'train_ks', 'val_ks', 'oos_ks'])

    # Custom objective: the loss is only reported when the parameter set meets
    # the train/validation consistency requirement below.
    def objective(self, param):
        '''
        Evaluate one hyperopt parameter set.

        Runs ``self.train_epoch_(org, param)`` for every key of
        ``self.tr_orgidx`` in a process pool and aggregates the returned
        records (expected columns: ``train_ks``, ``val_ks``, ``oos_ks``).

        Returns a dict with ``loss``, ``param`` and ``status``.
        '''
        begin_time = time.time()
        if self.fobj is not None:
            param.update({'objective': self.fobj})
        # Run LightGBM in a pool of 9 processes.
        tasks = [(org, param) for org in self.tr_orgidx.keys()]
        with Pool(9) as pool:
            records = pool.starmap(self.train_epoch_, tasks)
        results = pd.concat(records, axis=0)

        mean_val_ks = np.mean(results['val_ks'])
        oos_ks = results['oos_ks']
        n_records = results.shape[0]
        if n_records > 2:
            # Mean of the OOS KS values after dropping the best and the worst.
            mean_oos_ks = (np.sum(oos_ks) - np.min(oos_ks) - np.max(oos_ks)) * 1.0 / (n_records - 2)
        else:
            # Nothing would be left after trimming; fall back to the plain
            # mean instead of dividing by zero.
            mean_oos_ks = np.mean(oos_ks)
        # A parameter set qualifies when, for every held-out organisation, the
        # train and validation KS differ by less than 3% (relative);
        # otherwise no loss is reported.
        if np.allclose(results['train_ks'], results['val_ks'], rtol=3e-2):
            loss = -(0.5*mean_val_ks + 0.5*mean_oos_ks)
            status = STATUS_OK
        else:
            loss = np.inf  # np.Inf was removed in NumPy 2.0
            status = STATUS_FAIL
        end_time = time.time()
        display(f"当前组参数训练耗时：{np.round((end_time-begin_time)*1.0/60, 2)}分")
        return {'loss': loss, 'param': param, 'status': status}

    # Entry point of the class; returns the trials.
    def tpesearch_params(self):
        '''
        Run the TPE search over ``self.params`` for ``self.max_iterations``
        evaluations and return ``self.trails`` wrapped in a DataFrame.
        '''
        begin_time_ = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        display(f"开始执行时间：{begin_time_}")
        if self.trails is None:
            # Without a trials object fmin keeps the history to itself and
            # there would be nothing to return.
            from hyperopt import Trials
            self.trails = Trials()
        _ = fmin(fn=self.objective, space=self.params, algo=tpe.suggest, max_evals=self.max_iterations, trials=self.trails)
        return pd.DataFrame(self.trails)

In [7]:
# Scratch check: removing a key mutates the dict in place.
a = dict(a=2, b=1)
del a["b"]
a

{'a': 2}

In [None]:
# Hyperopt search space handed to HyperOptLGB as `params`.
# NOTE: this rebinds `params`, which the data-loading cell used for the
# get_dataset() arguments.
params = dict(
    num_threads=3,
    num_iterations=scope.int(hp.quniform('num_iterations', 70, 150, 5)),
    learning_rate=hp.quniform('learning_rate', 0.01, 0.05, 0.01),
    colsample_bytree=hp.quniform('colsample_bytree', 0.5, 0.9, 0.1),
    max_depth=scope.int(hp.quniform('max_depth', 3, 7, 1)),
    max_bin=scope.int(hp.quniform('max_bin', 50, 150, 10)),
    min_child_weight=scope.int(hp.quniform('min_child_weight', 10, 30, 5)),
    reg_alpha=hp.quniform('reg_alpha', 1, 10, 1),
    reg_lambda=hp.quniform('reg_lambda', 1, 10, 1),
    objective='binary',
    metric='auc',
)
# Search configuration: no custom objective, 200 evaluations.
kwargs = dict(data=data, params=params, fobj=None, max_iterations=200)
optlgb = HyperOptLGB(**kwargs)
trails = optlgb.tpesearch_params()