In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

## DAY01:

In [2]:
df = pd.read_csv('data.csv', encoding = 'gbk')

### 1. 查看数据

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,5,10,12,13,14
custid,2791858,534047,2849787,1809708,2499829
trade_no,20180507115231274000000023057383,20180507121002192000000023073000,20180507125159718000000023114911,20180507121358683000000388283484,20180507115448545000000388205844
bank_card_no,卡号1,卡号1,卡号1,卡号1,卡号1
low_volume_percent,0.01,0.02,0.04,0,0.01
middle_volume_percent,0.99,0.94,0.96,0.96,0.99
take_amount_in_later_12_month_highest,0,2000,0,2000,0
trans_amount_increase_rate_lately,0.9,1.28,1,0.13,0.46
trans_activity_month,0.55,1,1,0.57,1
trans_activity_day,0.313,0.458,0.114,0.777,0.175


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 90 columns):
Unnamed: 0                                    4754 non-null int64
custid                                        4754 non-null int64
trade_no                                      4754 non-null object
bank_card_no                                  4754 non-null object
low_volume_percent                            4752 non-null float64
middle_volume_percent                         4752 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4751 non-null float64
trans_activity_month                          4752 non-null float64
trans_activity_day                            4752 non-null float64
transd_mcc                                    4752 non-null float64
trans_days_interval_filter                    4746 non-null float64
trans_days_interval                           4752 non-null float64
regional_mobility

In [5]:
target = df['status']
df.drop(['status'], axis = 1, inplace = True)
df.shape

(4754, 89)

### 2. 缺失值处理

In [6]:
dict1 = df.isnull().sum()

In [7]:
dict1

Unnamed: 0                                  0
custid                                      0
trade_no                                    0
bank_card_no                                0
low_volume_percent                          2
middle_volume_percent                       2
take_amount_in_later_12_month_highest       0
trans_amount_increase_rate_lately           3
trans_activity_month                        2
trans_activity_day                          2
transd_mcc                                  2
trans_days_interval_filter                  8
trans_days_interval                         2
regional_mobility                           2
student_feature                          2998
repayment_capability                        0
is_high_user                                0
number_of_trans_from_2011                   2
first_transaction_time                      2
historical_trans_amount                     0
historical_trans_day                        2
rank_trad_1_month                 

In [8]:
for key in list(dict1.keys()):
    if dict1[key] == 0:
        print('删除没有缺失值的列', key)
        del dict1[key]
    elif dict1[key] > 2000:
        print('删除缺失值过多的列',key)
        df.drop([key],axis = 1,inplace = True)
        del dict1[key]

删除没有缺失值的列 Unnamed: 0
删除没有缺失值的列 custid
删除没有缺失值的列 trade_no
删除没有缺失值的列 bank_card_no
删除没有缺失值的列 take_amount_in_later_12_month_highest
删除缺失值过多的列 student_feature
删除没有缺失值的列 repayment_capability
删除没有缺失值的列 is_high_user
删除没有缺失值的列 historical_trans_amount
删除没有缺失值的列 trans_amount_3_month
删除没有缺失值的列 abs
删除没有缺失值的列 avg_price_last_12_month
删除没有缺失值的列 max_cumulative_consume_later_1_month
删除没有缺失值的列 pawns_auctions_trusts_consume_last_1_month
删除没有缺失值的列 pawns_auctions_trusts_consume_last_6_month
删除没有缺失值的列 source


In [9]:
len(dict1)

73

In [10]:
# 数值型数据用均值填充
for key in dict1.keys():
    if df[key].dtype != object:
        df[key].fillna(df[key].mean(), inplace = True)
    else:
        print('object类型的缺失值和个数',key,dict1[key])

object类型的缺失值和个数 reg_preference_for_trad 2
object类型的缺失值和个数 id_name 276
object类型的缺失值和个数 latest_query_time 304
object类型的缺失值和个数 loans_latest_time 297


In [11]:
# 处理object类型数据的缺失值
df['reg_preference_for_trad'].value_counts()

一线城市    3403
三线城市    1064
境外       150
二线城市     131
其他城市       4
Name: reg_preference_for_trad, dtype: int64

In [12]:
df['reg_preference_for_trad'].fillna('其他城市', inplace = True)
df['reg_preference_for_trad'].isnull().sum()

0

In [13]:
# id_name 没有意义
df.drop(['id_name'], axis = 1, inplace = True)

In [14]:
# 时间类型众数填充
arr = ['latest_query_time', 'loans_latest_time']
for row in arr:
    df[row].fillna(df[row].mode()[0], inplace = True)
    print(df[row].isnull().sum())

0
0


In [15]:
# 检验效果
df.isnull().sum().sum()

0

### 3. 数据处理

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 87 columns):
Unnamed: 0                                    4754 non-null int64
custid                                        4754 non-null int64
trade_no                                      4754 non-null object
bank_card_no                                  4754 non-null object
low_volume_percent                            4754 non-null float64
middle_volume_percent                         4754 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4754 non-null float64
trans_activity_month                          4754 non-null float64
trans_activity_day                            4754 non-null float64
transd_mcc                                    4754 non-null float64
trans_days_interval_filter                    4754 non-null float64
trans_days_interval                           4754 non-null float64
regional_mobility

In [17]:
df['Unnamed: 0'].value_counts()

2047     1
6774     1
8825     1
637      1
4735     1
10880    1
8833     1
6786     1
645      1
9143     1
4743     1
10888    1
8841     1
4747     1
10892    1
2704     1
6802     1
2680     1
8821     1
8807     1
6770     1
10902    1
605      1
6750     1
10848    1
609      1
4707     1
8805     1
6758     1
2664     1
        ..
3371     1
11567    1
9178     1
7473     1
1330     1
3379     1
5428     1
7477     1
1334     1
3383     1
7481     1
7461     1
5412     1
11444    1
5392     1
3331     1
3335     1
1290     1
9568     1
9486     1
10317    1
11308    1
3359     1
10945    1
3347     1
7445     1
1302     1
3355     1
7453     1
8192     1
Name: Unnamed: 0, Length: 4754, dtype: int64

In [18]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [19]:
# 数值类数据归一化
for i in df.columns:
    if df[i].dtype == object:
        print(i)
    else:
        df[i] = (df[i]- df[i].min())/(df[i].max()-df[i].min())
df.head()

trade_no
bank_card_no
reg_preference_for_trad
source
latest_query_time
loans_latest_time


Unnamed: 0,custid,trade_no,bank_card_no,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,0.697138,20180507115231274000000023057383,卡号1,0.01,0.99,0.0,1.9e-05,0.488636,0.30837,0.375,...,0.29,0.244638,0.013777,0.862069,0.055556,0.1,0.004505,0.014493,0.038674,0.061538
1,0.133331,20180507121002192000000023073000,卡号1,0.02,0.94,0.029412,2.7e-05,1.0,0.468062,0.425,...,0.35,0.254783,0.173364,0.91954,0.277778,0.3,0.085586,0.113043,0.016575,0.012308
2,0.711603,20180507125159718000000023114911,卡号1,0.04,0.96,0.0,2.1e-05,1.0,0.089207,0.275,...,0.16,0.181159,0.04822,1.0,0.055556,0.05,0.015766,0.050725,0.01105,0.024615
3,0.451881,20180507121358683000000388283484,卡号1,0.0,0.96,0.029412,3e-06,0.511364,0.819383,0.5,...,0.32,0.223333,0.187141,0.91954,0.277778,0.25,0.112613,0.147101,0.01105,0.018462
4,0.624214,20180507115448545000000388205844,卡号1,0.01,0.99,0.0,1e-05,1.0,0.156388,0.275,...,0.23,0.236232,0.095293,0.908046,0.111111,0.1,0.031532,0.099638,0.066298,0.375385


In [20]:
# 非数值类型One-Hot
df['bank_card_no'].value_counts()

卡号1    4754
Name: bank_card_no, dtype: int64

In [21]:
df.drop(['bank_card_no'], axis = 1, inplace = True)

In [22]:
df.reg_preference_for_trad.value_counts()

一线城市    3403
三线城市    1064
境外       150
二线城市     131
其他城市       6
Name: reg_preference_for_trad, dtype: int64

In [23]:
df = df.join(pd.get_dummies(df['reg_preference_for_trad']))
df.drop(['reg_preference_for_trad'],axis = 1,inplace = True)

In [24]:
df['latest_query_time'].value_counts()

2018-04-14    727
2018-05-06    223
2018-05-05    214
2018-05-04    210
2018-05-02    204
2018-05-03    201
2018-04-13    177
2018-04-23    166
2018-04-28    139
2018-05-01    133
2018-04-26    132
2018-04-27    131
2018-04-29    129
2018-04-25    120
2018-04-30    116
2018-04-24    101
2018-04-21     90
2018-04-19     85
2018-04-16     81
2018-04-20     78
2018-04-22     74
2018-04-17     70
2018-04-15     69
2018-04-18     62
2018-04-02     48
2018-04-11     45
2018-04-10     39
2018-03-27     36
2018-04-05     33
2018-04-04     31
             ... 
2017-05-13      1
2017-05-24      1
2017-09-28      1
2018-02-18      1
2017-09-30      1
2017-07-04      1
2017-12-15      1
2018-02-16      1
2017-11-10      1
2017-09-05      1
2018-01-17      1
2017-11-28      1
2017-07-05      1
2017-11-30      1
2017-05-26      1
2017-09-18      1
2017-12-10      1
2018-02-24      1
2018-01-19      1
2017-07-19      1
2017-09-13      1
2017-07-09      1
2018-01-21      1
2018-01-04      1
2017-09-29

In [25]:
df['latest_query_time'] = pd.to_datetime(df['latest_query_time'],format='%Y-%m-%d')
df['loans_latest_time'] = pd.to_datetime(df['loans_latest_time'],format='%Y-%m-%d')

In [26]:
df['query_year'] = [i.year for i in df['latest_query_time']]
df['query_month'] = [i.month for i in df['latest_query_time']]
df['loans_year'] = [i.year for i in df['loans_latest_time']]
df['loans_month'] = [i.month for i in df['loans_latest_time']]

In [27]:
cols = ['query_year','query_month','loans_year','loans_month']
for col in cols:
    df = df.join(pd.get_dummies(df[col]),how='left', lsuffix='_left', rsuffix='_right')

In [28]:
cols = ['query_year','query_month','loans_year','loans_month','latest_query_time','loans_latest_time']
for col in cols:
    df.drop([col],axis = 1,inplace = True)

In [29]:
df.source.value_counts()

xs    4754
Name: source, dtype: int64

In [30]:
df.drop(['source'], axis = 1, inplace = True)

In [31]:
df.trade_no.value_counts()

20180504164148660000000381457864    1
20180507124217414000000388392277    1
20180507122925749000000388344503    1
20180507120335147000000388240608    1
20180507123036048000000388346016    1
20180507123025777000000388349473    1
20180507115551791000000388208135    1
20180504164110002000000381451890    1
20180507115530513000000388209799    1
20180504163644686000000021186159    1
20180507120651257000000388253493    1
20180507121722686000000023083057    1
20180504184057806000000381902091    1
20180507115224035000000388198978    1
20180507121751020000000388292695    1
20180507124851468000000388419360    1
20180504155214733000000381233684    1
20180504172749786000000021237355    1
20180504161919304000000381359812    1
20180507125115078000000023113976    1
20180504182022079000000381847944    1
20180507115551969000000388210122    1
20180507122632070000000388331919    1
20180504170138492000000381541358    1
20180507115150271000000023055690    1
20180507122013242000000023084594    1
201805071205

In [32]:
# trade_no 属于类别型数据，需要删除
df.drop(['trade_no'],axis = 1,inplace = True)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Columns: 113 entries, custid to 12_right
dtypes: float64(80), uint8(33)
memory usage: 3.1 MB


### 4. 数据划分

In [34]:
from sklearn.model_selection import  train_test_split
x_train, x_test, y_train, y_test = train_test_split(df,target,train_size=0.7, random_state=2018)



In [35]:
print(x_train.shape,y_train.shape)

(3327, 113) (3327,)


In [36]:
from sklearn import ensemble
model_random_forest_Classifier = ensemble.RandomForestClassifier(n_estimators=20)
model_random_forest_Classifier.fit(x_train,y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
model_random_forest_Classifier.score(x_test,y_test)

0.7729502452697968

In [38]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
model = SGDClassifier(#lambda:
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)

In [39]:
model.fit(x_train,y_train)

SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.01,
       learning_rate='optimal', loss='log', max_iter=100, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [40]:
print(model.score(x_test,y_test))

0.7603363700070077


## DAY02

### 随机森林挑选特征

In [44]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=20, random_state=2019, n_jobs=-1)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=2019, verbose=0,
            warm_start=False)

In [45]:
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]

for i in range(x_train.shape[1]):
    print("%2d. %-*s %f"%(i+1, 30, x_train.columns[i], importance[imp_result[i]]))

 1. custid                         0.051384
 2. low_volume_percent             0.047571
 3. middle_volume_percent          0.039346
 4. take_amount_in_later_12_month_highest 0.028270
 5. trans_amount_increase_rate_lately 0.026379
 6. trans_activity_month           0.017269
 7. trans_activity_day             0.016808
 8. transd_mcc                     0.016619
 9. trans_days_interval_filter     0.016210
10. trans_days_interval            0.016071
11. regional_mobility              0.015513
12. repayment_capability           0.015430
13. is_high_user                   0.014900
14. number_of_trans_from_2011      0.014817
15. first_transaction_time         0.014465
16. historical_trans_amount        0.014352
17. historical_trans_day           0.014310
18. rank_trad_1_month              0.014288
19. trans_amount_3_month           0.014230
20. avg_consume_less_12_valid_month 0.014081
21. abs                            0.013953
22. top_trans_count_last_1_month   0.013780
23. avg_price_last_12

In [46]:
threshold = 0.01
data_index = list(x_train.columns[importance < threshold])
x_train.drop(data_index,axis=1,inplace=True)
x_test.drop(data_index,axis = 1,inplace = True)
assert x_train.shape[1] == x_test.shape[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [47]:
from sklearn import ensemble
model_random_forest_Classifier = ensemble.RandomForestClassifier(n_estimators=20,random_state = 2019)
model_random_forest_Classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=2019, verbose=0,
            warm_start=False)

In [48]:
model_random_forest_Classifier.score(x_test,y_test)

0.7687456201822004

关于VI值的参考链接：https://www.jianshu.com/p/bd350351aa5c

IV值（Information Value），中文意思是信息价值，或者信息量。是评分卡模型中的一个常见指标，“用IV去衡量变量预测能力”，在金融风控领域得到了广泛的应用。

因为计算过程中用的是命中黑白样本各自的比例，所以在工程实践中，一定程度上规避了黑白样本数的不同选择所带来的偏差。

并不是IV值越大越好，当IV值大于0.5时，我们需要对这个特征打个疑问，因为它过于太好而显得不够真实。**通常我们会选择IV值在0.1~0.5这个范围的特征。** 可能不同场景在取值的细节上会有所不同。IV < 0.02 属于无效预测。

In [49]:
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):  
    res_woe = []
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) 连续特征离散化
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) 计算该特征的woe和iv
        # woe_dict, iv = woe_single_x(x, y, feature, event)
        woe_dict, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
        res_woe.append(woe_dict) 
        
    return iv_dict
        
def discrete(x):
    # 使用5等分离散化特征
    res = np.zeros(x.shape)
    for i in range(5):
        point1 = stats.scoreatpercentile(x, i * 20)
        point2 = stats.scoreatpercentile(x, (i + 1) * 20)
        x1 = x[np.where((x >= point1) & (x <= point2))]
        mask = np.in1d(x, x1)
        res[mask] = i + 1    # 将[i, i+1]块内的值标记成i+1
    return res

def woe_single_x(x, y, feature,event = 1):
    # event代表预测正例的标签
    event_total = sum(y == event)
    non_event_total = y.shape[-1] - event_total
    
    iv = 0
    woe_dict = {}
    for x1 in set(x):    # 遍历各个块
        y1 = y.reindex(np.where(x == x1)[0])
        event_count = sum(y1 == event)
        non_event_count = y1.shape[-1] - event_count
        rate_event = event_count / event_total    
        rate_non_event = non_event_count / non_event_total
        
        if rate_event == 0:
            rate_event = 0.0001
            # woei = -20
        elif rate_non_event == 0:
            rate_non_event = 0.0001
            # woei = 20
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv

In [50]:
iv_dict = woe(x_train, y_train)
iv = sorted(iv_dict.items(), key = lambda x:x[1],reverse = True)
print(len(iv))
iv

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


47


[('history_fail_fee', 0.23601952893571299),
 ('latest_six_month_apply', 0.23490510480064064),
 ('latest_query_day', 0.23165030755336188),
 ('history_suc_fee', 0.23132587006862826),
 ('trans_days_interval', 0.23127346695672282),
 ('trans_activity_day', 0.23089021521474926),
 ('avg_price_last_12_month', 0.23076797912604352),
 ('apply_score', 0.23068994910029134),
 ('trans_amount_3_month', 0.23010632257866417),
 ('loans_avg_limit', 0.22937233933022275),
 ('consfin_credibility', 0.22804472290267083),
 ('historical_trans_day', 0.22785892580201067),
 ('trans_day_last_12_month', 0.2258295769673027),
 ('trans_amount_increase_rate_lately', 0.22579260485931757),
 ('first_transaction_day', 0.22577667440590374),
 ('first_transaction_time', 0.22558029316437583),
 ('middle_volume_percent', 0.22535903805135094),
 ('query_org_count', 0.22529059153249648),
 ('trans_top_time_last_6_month', 0.22440575809597219),
 ('trans_fail_top_count_enum_last_1_month', 0.2242031888186113),
 ('latest_six_month_loan', 0

In [51]:
useless = []
for feature in x_train.columns:
    if feature in [t[1] for t in iv[30:]]:
        useless.append(feature)
        print(feature, iv_dict[feature])

In [52]:
x_train.drop(useless, axis = 1, inplace = True)
x_test.drop(useless, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [53]:
model_random_forest_Classifier = ensemble.RandomForestClassifier(n_estimators=20,random_state = 2019)
model_random_forest_Classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=2019, verbose=0,
            warm_start=False)

In [54]:
model_random_forest_Classifier.score(x_test,y_test)

0.7687456201822004