In [1]:
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import chi2, chi2_contingency

In [5]:
df = pd.read_csv('datasets/data.csv')
data_train, data_test = train_test_split(df, test_size=0.2)

In [6]:
data_x = df.drop('bad', axis=1)
num_features = list(data_x.dtypes[~(data_x.dtypes == object)].index)
cat_features = list(data_x.dtypes[data_x.dtypes == object].index)

In [7]:
cat_maxvalue_ = 5
max_interval_ = 5  # 分箱后的箱数
IV_threshold_ = 0.1
cor_threshold_ = 0.6
VIF_threshold_ = 5
ANOVA_threshold_ = 0.01
RFE_ratio_ = 0.85
select_model_rario_ = 0.5
pvalues_threshold_ = 0.1

In [9]:
# 检查类别变量中哪些变量取值超过 5（通过cat_maxvalue常量来设置）
more_value_features = []
less_value_features = []
for var in ['f1','f2','f3']:
    value_count = len(set(data_train[var]))
    if value_count > cat_maxvalue_:
        more_value_features.append(var)
    else:
        less_value_features.append(var)

In [13]:
"""
卡方分箱
WOE
IV
"""

# import numpy as np
# import time
# from scipy.stats import chi2, chi2_contingency

def bin_bad_rate(df, col, target):
    """
    计算坏样本率
    :param df: 需要计算好坏比率的数据集
    :param col: 需要计算好坏比率的特征
    :param target: 好坏标签
    :return: 每箱的坏样本率
    """
    total = df.groupby(by=col, sort=True)[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby(by=col, sort=True)[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup['good'] = regroup['total'] - regroup['bad']
    regroup.reset_index(level=0, inplace=True)
    regroup['bad_rate'] = regroup['bad'] / regroup['total']
    return regroup


def merge_cat_bin(df, col, target, direction='bad'):
    """
    类别变量合并
    合并逻辑：坏样本率为0时，按照从小到大排序依次合并到下一个类别中直到合并后的类别坏样本率大于0
            坏样本率为1时，按照从大到小排序依次合并到下一个类别中直到合并后的类别坏样本率小于1
    :param df: 包含检验0%或100%的坏样本率的数据集
    :param col: 需要合并的类别变量
    :param target: 目标变量，1表示坏
    :param direction: 用来区别坏样本率为0或1
    :return: 合并结果，使得每个组里同时包含好坏样本
    """
    regroup = bin_bad_rate(df, col, target)
    if direction == 'bad':
        regroup = regroup.sort_values(by='bad_rate')
    else:
        regroup = regroup.sort_values(by='bad_rate', ascending=False)
    regroup_col = [[i] for i in regroup[col]]
    del_index = []
    for i in range(regroup.shape[0]-1):
        regroup_col[i+1] = regroup_col[i] + regroup_col[i+1]
        del_index.append(i)
        if direction == 'bad':
            if regroup['bad_rate'][i+1] > 0:
                break
        else:
            if regroup['bad_rate'] < 1:
                break
    regroup_col2 = [regroup_col[i] for i in range(len(regroup_col)) if i not in del_index]
    new_group = {}
    for i in range(len(regroup_col2)):
        for g2 in regroup_col2[i]:
            new_group[g2] = 'bin' + str(i)
    return new_group


def split_data(df, col, num_split):
    """
    切分逻辑：等频分成100份，再将切割点去重
    :param df: 包含需要待分箱特征的数据集
    :param col: 待分箱的变量
    :param num_split: 切分的组别数
    :return: col中切分点（在原数据集上增加一列，把原始细粒度的col重新划分成粗粒度的值，便于后面分箱的合并处理）
    """
    N = df.shape[0]
    n = int(N / num_split)
    split_point_index = [i * n - 1 for i in range(1, num_split)]
    raw_values = sorted(df[col])
    split_point = [raw_values[i] for i in split_point_index]
    split_point = sorted(set(split_point))
    return split_point


def assign_group(x, bin, value_max):
    """
    分配逻辑：对于在区间(a, b]内的数值，取右侧的值作为这个区间所有值的表征；
            对于大于切分点最大值的，可随意取一个较大的数作为表征
    :param x: 某个变量的某个取值
    :param bin: 上述变量的分箱结果
    :param value_max: 这列数据的最大值
    :return: x在分箱结果下的映射
    """
    N = len(bin)
    if x <= min(bin):
        return min(bin)
    elif x > max(bin):
        return value_max + 1
    else:
        for i in range(N-1):
            if bin[i] < x <= bin[i+1]:
                return bin[i+1]


def chisq(df):
    """
    计算卡方值
    :param df: 包含各类别全部样本总计和各类别坏样本总计的数据框
    :return: 卡方值
    """
    # 经过Yate修正
    bad_rate = sum(df['bad']) / sum(df['total'])
    if bad_rate in [0, 1]:
        return 0
    df['good'] = df['total'] - df['bad']
    good_rate = sum(df['good']) / sum(df['total'])
    df['bad_expected'] = df['total'] * bad_rate
    df['good_expected'] = df['total'] * good_rate
    bad_combined = zip(df['bad_expected'], df['bad'])
    good_combined = zip(df['good_expected'], df['good'])
    bad_chi = [(abs(i[0] - i[1]) - 0.5)**2 / i[0] for i in bad_combined]
    good_chi = [(abs(i[0] - i[1]) - 0.5)**2 / i[0] for i in good_combined]
    chisq_s = sum(bad_chi) + sum(good_chi)
    return chisq_s


def chisq_v2(df):
    # 计算卡方值_v2
    bad_rate = sum(df['bad']) / sum(df['total'])
    if bad_rate in [0, 1]:
        return 0
    else:
        chi = chi2_contingency(df[['bad', 'good']])[0]
        return chi


def assign_bin(x, cut_points, special_attribute=None):
    """
    根据切分点分箱
    :param x: 某个变量的某个取值
    :param cut_points: 上述变量的分箱结果，用切分点表示
    :param special_attribute: 不参与分箱的特殊取值
    :return: 分箱后对应的第几个箱子
    """
    if special_attribute is None:
        special_attribute = []
    num_bin = len(cut_points)
    if x in special_attribute:
        i = special_attribute.index(x) + 1
        return 'Bin_{}'.format(0 - i)
    if x <= cut_points[0]:
        return 'Bin_0'
    elif x > cut_points[-1]:
        return 'Bin_{}'.format(num_bin)
    else:
        for i in range(0, num_bin - 1):
            if cut_points[i] < x <= cut_points[i + 1]:
                return 'Bin_{}'.format(i + 1)


def chi_merge(df, col, target, max_interval=5, special_attribute=None, set_threshold=True):
    """
    卡方分箱
    :param df: 包含目标变量与分箱属性的数据框
    :param col: 需要分箱的属性
    :param target: 目标变量
    :param max_interval: 最大分箱数，如果原始属性的取值个数低于该参数，不执行这段函数
    :param special_attribute: 不参与分箱的取值
    :param set_threshold: 是否设置合并的卡方上限
    :return: 分箱结果
    """
    if special_attribute is None:
        special_attribute = []
    if len(special_attribute) > 0:
        df2 = df[~(df[col].isin(special_attribute))]  # 去掉special_attribute后的df
    else:
        df2 = df.copy()
    col_values = sorted(set(df2[col]))
    n_distinct = len(col_values)
    if n_distinct <= max_interval - len(special_attribute):  # 如果原始属性的取值个数低于max_interval，不执行卡方分箱的逻辑
        cut_points = col_values[:-1]
        df2['temp'] = df2[col]
    else:
        # 1. 将col按等频分成100份，如果col中去重之后的个数小于100直接进行下面的步骤
        if n_distinct > 100:
            split_x = split_data(df2, col, 100)
            col_max = max(df2[col])
            # assign_group函数：返回原值在切分后的映射，经map后，生成该特征初步分箱后的结果
            df2['temp'] = df2[col].map(lambda x: assign_group(x, split_x, col_max))
        else:
            df2['temp'] = df2[col]
        # 计算初步分箱后的bad rate，总体bad rate将被用来计算expected bad count
        regroup = bin_bad_rate(df2, 'temp', target)
        # 首先，每个单独的属性值将被分为单独的一组，然后两两组别进行合并
        group_intervals = [[i] for i in regroup['temp']]

        # 2. 建立循环，不断合并最优的相邻两个组别，直到：
        # （1）最优的两个组别的卡方值大于合并阈值，或最终分裂出来的分箱数<=预设的最大分箱数
        # （2）每箱同时包含好坏样本
        # （3）每箱的占比不低于预设值（可选，这里没设置）
        # 如果有特殊属性，那么最终分裂出来的分箱数 = 设的最大分箱数 - 特殊属性的个数
        split_intervals = max_interval - len(special_attribute)
        # 合并的卡方阈值
        if set_threshold:
            threshold = chi2.isf(0.05, 1)
        else:
            threshold = 999
#         time0 = time.time()
        while len(group_intervals) > split_intervals:
#             time1 = time.time()
            # 每次循环计算合并相邻组别后的卡方值
            chi_list = []
            for k in range(len(group_intervals)-1):
                temp_group = [min(group_intervals[k]), min(group_intervals[k+1])]
                df2g = regroup.loc[regroup['temp'].isin(temp_group)][['total', 'bad', 'good']]
                # chi = chisq(df2g)
                chi = chisq_v2(df2g)
                chi_list.append(chi)
            # time2 = time.time()
            # print(time2 - time1)
            chi_min = min(chi_list)
            if chi_min <= threshold:
                best_combined = chi_list.index(chi_min)
                # 更新group_intervals
                group_intervals[best_combined] = group_intervals[best_combined] + group_intervals[best_combined+1]
                group_intervals.remove(group_intervals[best_combined+1])
                # 更新regroup
                regroup.iloc[best_combined, 1:] = regroup.iloc[best_combined, 1:] + regroup.iloc[best_combined+1, 1:]
                drop_index = regroup.index[best_combined + 1]
                regroup.drop(drop_index, inplace=True)
                # time3 = time.time()
                # print(time3 - time2)
            else:
                break
        # time4 = time.time()
        # print(time4 - time0)
        cut_points = [max(i) for i in group_intervals[:-1]]
    # 检查是否有箱没有好或者坏样本。如果有，需要跟相邻的箱进行合并，直到每箱同时包含好坏样本
    df2['temp_bin'] = df2['temp'].apply(lambda x: assign_bin(x, cut_points))
    regroup = bin_bad_rate(df2, 'temp_bin', target)
    bbr_values = regroup['bad_rate']
    min_bad_rate = min(bbr_values)
    max_bad_rate = max(bbr_values)
    while min_bad_rate == 0 or max_bad_rate == 1:
        # 找出全部为好/坏样本的箱子
        bin_bad_01 = regroup[regroup['bad_rate'].isin([0, 1])].temp_bin.tolist()
        bin0 = bin_bad_01[0]
        bin_max = max([i.split('_')[1] for i in regroup.temp_bin])
        bin_min = min([i.split('_')[1] for i in regroup.temp_bin])
        # 如果是最后一箱, 则需要和上一个箱进行合并，分裂点cut_points中的最后一个需要移除
        if bin0.split('_')[1] == bin_max:
            del cut_points[-1]
        # 如果是第一箱，则需要和下一个箱进行合并, 分裂点cut_points中的第一个需要移除
        elif bin0.split('_')[1] == bin_min:
            del cut_points[0]
        # 如果是中间的某一箱，则需要和前后中的一个箱进行合并，依据是较小的卡方值
        else:
            # 和前一箱进行合并，并计算卡方值
            current_index = list(regroup.temp_bin).index(bin0)
            prev_bin = list(regroup.temp_bin)[current_index - 1]
            df2g = regroup.loc[regroup['temp_bin'].isin([prev_bin, bin0])][['total', 'bad', 'good']]
            chisq1 = chisq_v2(df2g)
            # 和后一箱进行合并，并计算卡方值
            later_bin = list(regroup.temp_bin)[current_index + 1]
            df2g = regroup.loc[regroup['temp_bin'].isin([later_bin, bin0])][['total', 'bad', 'good']]
            chisq2 = chisq_v2(df2g)
            if chisq1 < chisq2:
                cut_points.remove(cut_points[current_index - 1])
            else:
                cut_points.remove(cut_points[current_index])
        # 完成一次合并后，需要再次计算新的分箱准则下，每箱是否同时包含好坏样本
        if len(cut_points) > 0:
            df2['temp_bin'] = df2['temp'].apply(lambda x: assign_bin(x, cut_points))
            regroup = bin_bad_rate(df2, 'temp_bin', target)
            bbr_values = regroup['bad_rate']
            min_bad_rate = min(bbr_values)
            max_bad_rate = max(bbr_values)
        else:
            return special_attribute + [1e6]
    cut_points = special_attribute + cut_points
    return cut_points


def calc_woe_iv(df, col, target):
    """
    计算分箱后的WOE和IV
    :param df: 包含需要计算WOE的变量和目标变量
    :param col: 需要计算WOE、IV的变量，分箱后的变量，或不需要分箱的类别变量
    :param target: 目标变量，1为坏样本
    :return: 返回WOE和IV
    """
    regroup = bin_bad_rate(df, col, target)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    G = N - B
    regroup['bad_percent'] = regroup['bad'].map(lambda x: x / B)
    regroup['good_percent'] = regroup['good'].map(lambda x: x / G)
    regroup['WOE'] = regroup.apply(lambda x: np.log(x.bad_percent/x.good_percent), axis=1)
    woe_dict = dict(zip(regroup[col], regroup['WOE']))
    regroup['IV'] = regroup.apply(lambda x: (x.bad_percent - x.good_percent) * x.WOE, axis=1)
    IV = regroup['IV'].sum()
    return woe_dict, IV


def data_test_process(df, cm_dict, bre_dict, com_dict, WOE_dict):
    """
    处理测试集，返回woe编码后的测试集
    :param df: test data
    :param cm_dict: <= 5 的类别特征中存在某个类别只包含好样本或坏样本，合并之后的变量
    :param bre_dict: 大于5的类别特征，bad rate编码
    :param com_dict: 连续变量分箱结果，其中包括bad rate编码之后的类别特征
    :param WOE_dict: woe编码结果
    :return: 经过woe编码之后的test data
    """
    if len(cm_dict) > 0:
        for col in cm_dict.keys():
            bin_var = col + '_bin'
            df[bin_var] = df[col].map(cm_dict[col])
    if len(bre_dict) > 0:
            for col in bre_dict.keys():
                br_var = col + '_br_encoding'
                df[br_var] = df[col].map(bre_dict[col])
    if len(com_dict) > 0:
            for col in com_dict.keys():
                bin_var = col + '_bin'
                if -99999 not in set(df[col]):
                    df[bin_var] = df[col].map(lambda x: assign_bin(x, com_dict[col]))
                else:
                    df[bin_var] = df[col].map(lambda x: assign_bin(x, com_dict[col], special_attribute=[-99999]))
    if len(WOE_dict) > 0:
            for col in WOE_dict.keys():
                WOE_var = col + '_WOE'
                df[WOE_var] = df[col].map(WOE_dict[col])
    return df


def roc_curve(df, target, prob, thresholds):
    """
    计算fpr, tpr
    :param df: 包含目标变量和预测结果的数据集
    :param target: 目标变量
    :param prob: 预测为正样本的概率
    :param thresholds: 由训练数据得出的阈值
    :return: fpr, tpr
    """
    data = df[[target, prob]]
    fpr = []
    tpr = []
    for t in thresholds:
        data['predict_cat'] = data[prob].apply(lambda x: 1 if x > t else 0)
        data['true'] = (data[target] == data['predict_cat']) * 1
        TP = len(data[(data['true'] == 1) & (data['predict_cat'] == 1)])
        TN = len(data[(data['true'] == 1) & (data['predict_cat'] == 0)])
        FP = len(data[(data['true'] == 0) & (data['predict_cat'] == 1)])
        FN = len(data[(data['true'] == 0) & (data['predict_cat'] == 0)])
        fpr.append(FP / (FP + TN))
        tpr.append(TP / (TP + FN))
    return fpr, tpr


def bad_rate_monotone(df, sort_var, target, special_attribute=None):
    """
    坏样本率是否单调
    检验逻辑：两个箱子一定单调；两个以上时，检验是否有峰值点（大于相邻点的值）和谷值点（小于相邻点的值）
    :param df: 包含检验坏样本率的变量，和目标变量
    :param sort_var: 需要检验坏样本率的变量
    :param target: 目标变量，0、1表示好、坏
    :param special_attribute: 不参与检验的特殊值
    :return: 坏样本率单调与否
    """
    if special_attribute is None:
        special_attribute = []
    if len(special_attribute) > 0:
        df2 = df.loc[~df[sort_var].isin(special_attribute)]
    else:
        df2 = df.copy()
    if len(set(df2[sort_var])) <= 2:
        return True
    regroup = bin_bad_rate(df2, sort_var, target)
    bad_rate = list(regroup['bad_rate'])
    bad_rate_notmonotone = [bad_rate[i] > bad_rate[i-1] and bad_rate[i] > bad_rate[i+1] or bad_rate[i] < bad_rate[i-1]
                            and bad_rate[i] < bad_rate[i+1] for i in range(1, len(bad_rate) - 1)]
    if True in bad_rate_notmonotone:
        return False
    else:
        return True

In [14]:
# （i）类别变量取值小于5时，如果每种类别同时包含好坏样本，无需分箱；如果某个类别只包含好样本或坏样本，需要合并
cat_merged_dict = {}
var_bin_list = []
del_less_value_features = []
for col in less_value_features:
    regroup = bin_bad_rate(data_train, col, 'bad')
    bbr_values = regroup['bad_rate']  # 每个类别的坏样本占比
    if min(bbr_values) == 0:  # 某个类别只包含好样本
        combine_bin_dict = merge_cat_bin(data_train, col, 'bad')
        cat_merged_dict[col] = combine_bin_dict
        new_var = col + '_bin'
        data_train[new_var] = data_train[col].map(combine_bin_dict)
        var_bin_list.append(new_var)
        del_less_value_features.append(col)
    if max(bbr_values) == 1:  # 某个类别只包含坏样本
        combine_bin_dict = merge_cat_bin(data_train, col, 'bad', direction='good')
        cat_merged_dict[col] = combine_bin_dict
        new_var = col + '_bin'
        data_train[new_var] = data_train[col].map(combine_bin_dict)
        var_bin_list.append(new_var)
        del_less_value_features.append(col)
less_value_features = [col for col in less_value_features if col not in del_less_value_features]

# （ii）类别变量取值大于5时，需要bad rate编码，再用卡方分箱法分箱
br_encoding_dict = {}  # 记录进行bad rate 编码的变量，及编码方式
for col in more_value_features:
    regroup = bin_bad_rate(data_train, col, 'bad')
    br_encoding = dict(zip(regroup[col], regroup['bad_rate']))
    data_train[col + '_br_encoding'] = data_train[col].map(br_encoding)
    br_encoding_dict[col] = br_encoding
    num_features.append(col + '_br_encoding')

# （iii）对连续变量进行分箱，包括（ii）中的变量
continuous_merged_dict = {}
for col in num_features:
    print('{} is in processing'.format(col))
    max_interval = max_interval_
    if -99999 not in set(data_train[col]):   # 检查是否有特殊值存在，如果没有所有取值都参与分箱
        cut_off = chi_merge(data_train, col, 'bad', max_interval=max_interval,
                                                set_threshold=False)
        new_var = col + '_bin'
        data_train[new_var] = data_train[col].map(lambda x: assign_bin(x, cut_off))
        monotone = bad_rate_monotone(data_train, new_var, 'bad')
        while not monotone:
            max_interval -= 1
            cut_off = chi_merge(data_train, col, 'bad', max_interval=max_interval,
                                                    set_threshold=False)
            data_train[new_var] = data_train[col].map(lambda x: assign_bin(x, cut_off))
            if max_interval == 2:
                # 当分箱数为2时，必然单调
                break
            monotone = bad_rate_monotone(data_train, new_var, 'bad')
        var_bin_list.append(new_var)
    else:
        cut_off = chi_merge(data_train, col, 'bad', max_interval=max_interval,
                                                special_attribute=[-99999], set_threshold=False)
        new_var = col + '_bin'
        data_train[new_var] = data_train[col].map(lambda x: assign_bin(x, cut_off,
                                                                                           special_attribute=[-99999]))
        monotone = bad_rate_monotone(data_train, new_var, 'bad', ['bin_-1'])
        while not monotone:
            max_interval -= 1
            cut_off = chi_merge(data_train, col, 'bad', max_interval=max_interval,
                                                    special_attribute=[-99999], set_threshold=False)
            data_train[new_var] = data_train[col].map(lambda x: assign_bin(x, cut_off,
                                                                                               special_attribute=[-99999]))
            if max_interval == 3:
                # 当分箱数为3-1=2时，必然单调
                break
            monotone = bad_rate_monotone(data_train, new_var, 'bad', ['bin_-1'])
        var_bin_list.append(new_var)
    continuous_merged_dict[col] = cut_off

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


jdmall_user_p0066 is in processing
jdmall_user_p0001 is in processing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


jdmall_user_p0006 is in processing
jdmall_user_p0068 is in processing
jdmall_up_m0001 is in processing
jdmall_up_m0009 is in processing
pay_pay_p0004176 is in processing
catenum is in processing
catenum_3m is in processing
f2_br_encoding is in processing
f3_br_encoding is in processing
f2_br_encoding is in processing
f3_br_encoding is in processing
f2_br_encoding is in processing
f3_br_encoding is in processing


In [16]:
WOE_dict = {}
IV_dict = {}
# 分箱后进行WOE和IV编码的变量包括：
# 1. 初始取值个数小于5，且不需要合并的类别型变量。存放在less_value_features中
# 2. 初始取值个数小于5，需要合并的类别型变量。合并后新的变量存放在var_bin_list中
# 3. 初始取值个数超过5，需要合并的类别型变量。合并后新的变量存放在var_bin_list中
# 4. 连续变量。分箱后新的变量存放在var_bin_list中
all_var = var_bin_list + less_value_features
for var in all_var:
    WOE_dict[var], IV_dict[var] = calc_woe_iv(data_train, var, 'bad')

In [17]:
WOE_dict

{'jdmall_user_p0066_bin': {'Bin_0': -0.10661761131009002,
  'Bin_1': 0.06222736524059458},
 'jdmall_user_p0001_bin': {'Bin_0': 0.22610984202089826,
  'Bin_1': -0.011231373346589517,
  'Bin_2': -0.7951157830317914},
 'jdmall_user_p0006_bin': {'Bin_0': -0.013267832055195085,
  'Bin_1': 1.1858856858347921},
 'jdmall_user_p0068_bin': {'Bin_0': -0.03743008133427491,
  'Bin_1': 0.4559245321521305},
 'jdmall_up_m0001_bin': {'Bin_0': -0.05946068500552629,
  'Bin_1': 0.02334382763627756},
 'jdmall_up_m0009_bin': {'Bin_0': -0.05998545662265461,
  'Bin_1': 0.06137705286310283},
 'pay_pay_p0004176_bin': {'Bin_0': -0.032180881567003494,
  'Bin_1': 0.09313851661908053},
 'catenum_bin': {'Bin_0': 0.02182948891429853, 'Bin_1': -0.6386636062162537},
 'catenum_3m_bin': {'Bin_0': 0.018280525679731048,
  'Bin_1': -1.0654061127717032},
 'f2_br_encoding_bin': {'Bin_0': -0.5633141689744668,
  'Bin_1': -0.05730783164442516,
  'Bin_2': -0.055098583467963265,
  'Bin_3': 0.09249534114783399,
  'Bin_4': 0.4777006

In [18]:
IV_dict

{'jdmall_user_p0066_bin': 0.006630867506591915,
 'jdmall_user_p0001_bin': 0.037143737597943795,
 'jdmall_user_p0006_bin': 0.01571399041985139,
 'jdmall_user_p0068_bin': 0.017041135241000437,
 'jdmall_up_m0001_bin': 0.0013878794530931447,
 'jdmall_up_m0009_bin': 0.0036806013629319314,
 'pay_pay_p0004176_bin': 0.0029965312028889385,
 'catenum_bin': 0.01392562653672294,
 'catenum_3m_bin': 0.019445195513510594,
 'f2_br_encoding_bin': 0.05161053723936894,
 'f3_br_encoding_bin': 0.08909469024434707,
 'f1': 0.03792708164362232}