特征工程：
1.利用IV值进行变量选择
IV值即信息价值，表示变量对目标变量的预测能力，IV值越高，变量对目标变量的预测能力越强。
接下来利用IV值进行变量选择，IV值计算代码来自Datawhale。

In [21]:
import numpy as np
import math
import pandas as pd
from sklearn.utils.multiclass import type_of_target
from scipy import stats

In [22]:
# Load the data set from a hard-coded local Windows path
# (presumably the already-cleaned data, judging by the file name — TODO confirm).
data = pd.read_csv('d:/data_cle.csv')

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 82 columns):
low_volume_percent                            4754 non-null float64
middle_volume_percent                         4754 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4754 non-null float64
trans_activity_month                          4754 non-null float64
trans_activity_day                            4754 non-null float64
transd_mcc                                    4754 non-null int64
trans_days_interval_filter                    4754 non-null int64
trans_days_interval                           4754 non-null int64
regional_mobility                             4754 non-null int64
repayment_capability                          4754 non-null int64
is_high_user                                  4754 non-null int64
number_of_trans_from_2011                     4754 non-null int64
historical_trans_amount  

In [4]:
#求woe值和iv值
def woe(X, y, event):
    """Compute the WOE mapping and the information value (IV) of every feature.

    Args:
        X: Feature array whose columns are the individual features; it is
            passed through ``feature_discretion`` before any statistics
            are computed.
        y: Target labels, forwarded unchanged to ``woe_single_x``.
        event: Label value counted as the event class.

    Returns:
        A pair ``(woe_array, iv_array)``: ``woe_array`` holds one
        ``{bin: woe}`` dict per feature and ``iv_array`` the matching IV
        values, both in column order.
    """
    binned = feature_discretion(X)
    n_features = binned.shape[-1]
    # One (woe_dict, iv) pair per column, kept in column order.
    per_feature = [
        woe_single_x(binned[:, col], y, event) for col in range(n_features)
    ]
    woe_dicts = [pair[0] for pair in per_feature]
    iv_values = [pair[1] for pair in per_feature]
    return np.array(woe_dicts), np.array(iv_values)

#求单个特征的woe值
def woe_single_x(x, y, event):
    """Compute the per-bin WOE values and the IV of a single feature.

    For every distinct value (bin) of ``x`` the share of all events and the
    share of all non-events that fall into that bin are computed, and then::

        woe = ln(rate_event / rate_non_event)
        iv += (rate_event - rate_non_event) * woe

    A bin in which either rate is zero would have an infinite WOE. Such a
    bin is left out of the returned dict and adds nothing to the IV.

    Args:
        x: Values of one (already discretised) feature.
        y: Target labels aligned with ``x``.
        event: Label value counted as the event class.

    Returns:
        ``(woe_dict, iv)`` where ``woe_dict`` maps each kept bin value to
        its WOE and ``iv`` is the summed information value.
    """
    event_total, non_event_total = count_binary(y, event)
    woe_dict = {}
    iv = 0
    for x1 in np.unique(x):
        # Labels of the samples that fall into the current bin.
        y1 = y[x == x1]
        event_count, non_event_count = count_binary(y1, event=event)
        rate_event = 1.0 * event_count / event_total
        rate_non_event = 1.0 * non_event_count / non_event_total
        # log(0) or a division by zero would follow: skip the bin instead
        # of printing an empty line for it.
        if rate_event == 0 or rate_non_event == 0:
            continue
        woe1 = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woe1
        iv += (rate_event - rate_non_event) * woe1
    return woe_dict, iv

#计算个数
def count_binary(a, event):
    """Count how many entries of ``a`` equal ``event`` and how many do not.

    Args:
        a: Array of labels; the length of its last axis is used as the total.
        event: Label value to count.

    Returns:
        ``(event_count, non_event_count)``.
    """
    is_event = a == event
    hits = is_event.sum()
    misses = a.shape[-1] - hits
    return hits, misses

#判断特征数据是否为离散型
def feature_discretion(X):
    """Discretise every non-object column of ``X`` and keep the others as is.

    Args:
        X: Feature array whose columns are the individual features.

    Returns:
        An array with the same column order in which each column whose
        values pandas infers as a non-object dtype has been replaced by
        the output of ``discrete``; object columns are passed through.
    """
    columns = []
    for i in range(X.shape[-1]):
        x = X[:, i]
        # The dtype is re-inferred by pandas from the values themselves
        # rather than read from the array (presumably so numeric columns
        # of an object-dtype matrix are still binned — TODO confirm).
        if pd.Series(list(x)).dtype != 'O':
            columns.append(discrete(x))
        else:
            columns.append(x)
    return np.array(columns).T

#对连续型特征进行离散化
def discrete(x, n_bins=5):
    """Bin a numeric array into ``n_bins`` percentile-based groups.

    The cut points are the 0th, (100 / n_bins)th, ..., 100th percentiles
    of ``x``. Bin ``k`` (1-based) covers the closed interval between cut
    points ``k - 1`` and ``k``; a value that sits exactly on a shared cut
    point ends up in the higher bin because later bins overwrite earlier
    ones.

    Args:
        x: 1-D numeric array to discretise.
        n_bins: Number of percentile bins (default 5, i.e. quintiles).

    Returns:
        Integer array of the same length as ``x`` holding the bin number of
        every element; elements matched by no bin keep the value 0.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    res = np.zeros(x.shape[-1], dtype=int)
    # Compute all cut points in a single call so x is sorted only once.
    percents = [100 * i / n_bins for i in range(n_bins + 1)]
    cuts = stats.scoreatpercentile(x, percents)
    for i in range(n_bins):
        # The range mask is already the membership test; no in1d round-trip.
        in_bin = (x >= cuts[i]) & (x <= cuts[i + 1])
        res[in_bin] = i + 1
    return res

In [23]:
# Split the frame into the target vector (the 'status' column) and the
# feature matrix (every other column), both as plain NumPy arrays.
y = data['status'].values
x = data.drop('status', axis=1).values

In [24]:
# a: one {bin: woe} dict per feature; b: the IV of each feature.
# The label value 1 is treated as the event class.
a, b = woe(x, y, 1)

In [25]:
# Feature names, taken from the same column selection that built x.
name = data.drop('status', axis=1).columns

In [26]:
# Pair every feature name with its IV, then rank from highest to lowest IV.
dic = dict(zip(name, b))
dic_sort = sorted(dic.items(), key=lambda item: item[1], reverse=True)

In [27]:
# Display the (feature, IV) pairs, highest IV first.
dic_sort

[('trans_fail_top_count_enum_last_1_month', 0.5738251555141427),
 ('history_fail_fee', 0.49612183165064383),
 ('loans_score', 0.43866404297963046),
 ('loans_overdue_count', 0.3550896704777049),
 ('apply_score', 0.3476071802808357),
 ('trans_fail_top_count_enum_last_12_month', 0.2710896194189564),
 ('trans_fail_top_count_enum_last_6_month', 0.2599109333204192),
 ('latest_one_month_fail', 0.14450520167800465),
 ('latest_one_month_suc', 0.1192399447896281),
 ('rank_trad_1_month', 0.09658511076715653),
 ('max_cumulative_consume_later_1_month', 0.09410939090142396),
 ('trans_day_last_12_month', 0.08888942317190554),
 ('trans_top_time_last_1_month', 0.0826299051197211),
 ('pawns_auctions_trusts_consume_last_1_month', 0.05951063024820402),
 ('consfin_avg_limit', 0.05889557812498972),
 ('top_trans_count_last_1_month', 0.05865056716321834),
 ('trans_amount_3_month', 0.04338806330064936),
 ('consfin_credit_limit', 0.042782256187366335),
 ('latest_query_day', 0.04154244196885145),
 ('consume_top_

2.利用随机森林进行变量选择
主要根据模型输出的feature_importances_进行变量选择

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Rebind x and y to pandas objects (not .values) so that x.columns is
# available when the scores are matched back to feature names.
x = data.drop('status', axis=1)
y = data['status']

In [31]:
# Fit a random forest with library defaults on all features.
# NOTE(review): no random_state is passed, so the fitted forest and the
# feature importances derived from it can differ from run to run.
rf = RandomForestClassifier()
rf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
# Map each column name to the importance reported by the fitted forest,
# then rank the features from most to least important.
fm = rf.feature_importances_
dic_rf = dict(zip(x.columns, fm))
dic_sort_rf = sorted(dic_rf.items(), key=lambda item: item[1], reverse=True)

In [36]:
# Display the (feature, importance) pairs, most important first.
dic_sort_rf

[('trans_fail_top_count_enum_last_1_month', 0.06116028881863048),
 ('loans_score', 0.04583306843915648),
 ('history_fail_fee', 0.045731984291196945),
 ('apply_score', 0.026261568482083154),
 ('latest_query_day', 0.02254387339621276),
 ('max_cumulative_consume_later_1_month', 0.020688172348093756),
 ('loans_overdue_count', 0.019958442485439036),
 ('trans_amount_3_month', 0.01895899964141306),
 ('trans_activity_day', 0.01866420783478019),
 ('first_transaction_day', 0.01825998732247166),
 ('latest_one_month_fail', 0.01794663627751045),
 ('avg_price_last_12_month', 0.017862325684140472),
 ('repayment_capability', 0.017737367890553475),
 ('loans_avg_limit', 0.016512138536738774),
 ('trans_top_time_last_6_month', 0.016395304557166042),
 ('trans_amount_increase_rate_lately', 0.016299815734742935),
 ('trans_fail_top_count_enum_last_12_month', 0.015935730650635638),
 ('historical_trans_day', 0.015575746109557809),
 ('trans_days_interval', 0.01490338549492998),
 ('abs', 0.014608793542266883),
 (

3.采用互信息进行变量选择

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [38]:
# Estimate the mutual information between every feature column and the target.
# NOTE(review): called with defaults only (no random_state) — confirm
# whether the scores are reproducible between runs.
mf = mutual_info_classif(x, y)

In [40]:
# Pair every column name with its mutual-information score, then rank the
# features from highest to lowest score.
dic_mf = dict(zip(x.columns, mf))
dic_sort_mf = sorted(dic_mf.items(), key=lambda item: item[1], reverse=True)

In [41]:
# Display the (feature, mutual information) pairs, highest score first.
dic_sort_mf

[('trans_fail_top_count_enum_last_1_month', 0.07055262151591957),
 ('loans_score', 0.0572266437685065),
 ('history_fail_fee', 0.053906781982533625),
 ('loans_overdue_count', 0.0483835864627542),
 ('apply_score', 0.03923662307936948),
 ('latest_one_month_fail', 0.03154375514983854),
 ('trans_fail_top_count_enum_last_6_month', 0.03049331700710667),
 ('latest_query_day', 0.021427321616209527),
 ('max_cumulative_consume_later_1_month', 0.019426923420200026),
 ('trans_fail_top_count_enum_last_12_month', 0.01788811892743225),
 ('loans_long_time', 0.017522038100941772),
 ('trans_amount_increase_rate_lately', 0.015533945487086642),
 ('consfin_org_count_current', 0.015170090089650534),
 ('loans_product_count', 0.013817245610021622),
 ('latest_one_month_suc', 0.013679846969175014),
 ('pawns_auctions_trusts_consume_last_1_month', 0.013490341869606182),
 ('middle_volume_percent', 0.013205835337825267),
 ('query_finance_count', 0.01287986637114824),
 ('latest_three_month_loan', 0.011420768156729766