## 特征分析

In [3]:
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from scipy.stats import chi2
import statsmodels.discrete.discrete_model as dm_

# Put the parent directory at the front of the import path so that the
# `lec1.*` imports below resolve (presumably `lec1` lives one level up — TODO confirm).
sys.path.insert(0, os.path.abspath('../'))

%matplotlib inline

from lec1.preprocessor import *
from lec1.coverage import *

import toad

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 加载数据

In [4]:
# Directory holding the raw tables, resolved against the current working
# directory. The trailing slash is kept as part of the path string.
data_path = os.path.abspath('.') + "/home-credit-default-risk/"

# load_data returns the seven tables in this fixed order.
(
    application_train,
    bureau,
    bureau_balance,
    credit_card_balance,
    installment_payments,
    pos_cash_balance,
    previous_application,
) = load_data(data_path)

In [5]:
# Report the row and column count of every loaded table, one line per table.
# The labels are the exact strings printed for each table; note that two of
# them ('POS_CASH_balance:', 'installments_payments:') differ from the
# variable names they describe.
labelled_tables = [
    ('application_train:', application_train),
    ('bureau:', bureau),
    ('bureau_balance:', bureau_balance),
    ('credit_card_balance:', credit_card_balance),
    ('POS_CASH_balance:', pos_cash_balance),
    ('installments_payments:', installment_payments),
    ('previous_application:', previous_application),
]
for label, frame in labelled_tables:
    print(label, frame.shape[0], "rows and", frame.shape[1], 'columns')


application_train: 307511 rows and 122 columns
bureau: 1716428 rows and 17 columns
bureau_balance: 27299925 rows and 3 columns
credit_card_balance: 3840312 rows and 23 columns
POS_CASH_balance: 10001358 rows and 8 columns
installments_payments: 13605401 rows and 8 columns
previous_application: 1670214 rows and 37 columns


## application_train数据

In [6]:
# Per-column profile of the application table (dtype, missing rate,
# cardinality and distribution statistics); show the first ten columns only.
application_check = toad.detect(application_train)
display(application_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_CURR,int64,307511,0.00%,307511,278181,102790,100002,103590.0,135692.0,278202.0,367142.0,420729,452713,456255
TARGET,int64,307511,0.00%,2,0.0807288,0.272419,0,0.0,0.0,0.0,0.0,0,1,1
NAME_CONTRACT_TYPE,object,307511,0.00%,2,Cash loans:90.48%,Revolving loans:9.52%,,,,,,,Cash loans:90.48%,Revolving loans:9.52%
CODE_GENDER,object,307511,0.00%,3,F:65.83%,M:34.16%,XNA:0.00%,,,,,F:65.83%,M:34.16%,XNA:0.00%
FLAG_OWN_CAR,object,307511,0.00%,2,N:65.99%,Y:34.01%,,,,,,,N:65.99%,Y:34.01%
FLAG_OWN_REALTY,object,307511,0.00%,2,Y:69.37%,N:30.63%,,,,,,,Y:69.37%,N:30.63%
CNT_CHILDREN,int64,307511,0.00%,15,0.417052,0.722121,0,0.0,0.0,0.0,1.0,2,3,19
AMT_INCOME_TOTAL,float64,307511,0.00%,2548,168798,237123,25650,45000.0,81000.0,147150.0,202500.0,270000,472500,1.17e+08
AMT_CREDIT,float64,307511,0.00%,5603,599026,402491,45000,76410.0,180000.0,513531.0,808650.0,1.13375e+06,1.854e+06,4.05e+06
AMT_ANNUITY,float64,307511,0.00%,13672,27108.6,14493.7,1615.5,6182.91,11074.5,24903.0,34596.0,45954,70006.5,258026


In [7]:
# Profile only a hand-picked subset of application columns and show the
# complete result.
columns_to_inspect = [
    'TARGET',
    'DAYS_EMPLOYED',
    'CODE_GENDER',
    'OWN_CAR_AGE',
    'NAME_CONTRACT_TYPE',
    'FLAG_DOCUMENT_12',
]
application_check_sub = toad.detect(application_train[columns_to_inspect])
display(application_check_sub)

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
TARGET,int64,307511,0.00%,2,0.0807288,0.272419,0,0.0,0.0,0.0,0.0,0,1,1
DAYS_EMPLOYED,int64,307511,0.00%,12574,63815,141276,-17912,-10894.9,-4881.0,-1213.0,-289.0,365243,365243,365243
CODE_GENDER,object,307511,0.00%,3,F:65.83%,M:34.16%,XNA:0.00%,,,,,F:65.83%,M:34.16%,XNA:0.00%
OWN_CAR_AGE,float64,307511,65.99%,62,12.0611,11.9448,0,0.0,2.0,9.0,15.0,22,64,91
NAME_CONTRACT_TYPE,object,307511,0.00%,2,Cash loans:90.48%,Revolving loans:9.52%,,,,,,,Cash loans:90.48%,Revolving loans:9.52%
FLAG_DOCUMENT_12,int64,307511,0.00%,2,6.50383e-06,0.00255026,0,0.0,0.0,0.0,0.0,0,0,1


In [8]:
# Rank every application feature (ID column excluded) by its information
# value against TARGET. The ranking scans the whole table, so it is computed
# once and both ends of the same result are displayed from it.
application_iv = toad.quality(
    application_train.drop('SK_ID_CURR', axis=1),
    'TARGET',
    iv_only=True,
)

print('top 10 features with high iv')
display(application_iv[:10])
print('last 10 features with low iv')
display(application_iv[-10:])


top 10 features with high iv


Unnamed: 0,iv,gini,entropy,unique
EXT_SOURCE_3,0.332758,,,815.0
EXT_SOURCE_2,0.321745,,,119832.0
EXT_SOURCE_1,0.154901,,,114585.0
DAYS_EMPLOYED,0.114698,,,12574.0
AMT_GOODS_PRICE,0.101163,,,1003.0
DAYS_BIRTH,0.090051,,,17460.0
OCCUPATION_TYPE,0.082865,,,19.0
ORGANIZATION_TYPE,0.073368,,,58.0
AMT_CREDIT,0.068424,,,5603.0
NAME_INCOME_TYPE,0.058334,,,8.0


last 10 features with low iv


Unnamed: 0,iv,gini,entropy,unique
FLAG_MOBIL,8.938043e-05,,,2.0
FLAG_DOCUMENT_12,5.775823e-05,,,2.0
FLAG_EMAIL,4.210277e-05,,,2.0
FLAG_DOCUMENT_7,3.818893e-05,,,2.0
FLAG_DOCUMENT_4,3.787674e-05,,,2.0
FLAG_DOCUMENT_19,2.726998e-05,,,2.0
FLAG_DOCUMENT_10,7.552028e-06,,,2.0
FLAG_CONT_MOBILE,1.870741e-06,,,2.0
FLAG_DOCUMENT_5,1.348882e-06,,,2.0
FLAG_DOCUMENT_20,6.161632e-07,,,2.0


## Bureau数据

In [9]:
# Per-column profile of the bureau table; show the first ten columns only.
bureau_check = toad.detect(bureau)
display(bureau_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_CURR,int64,1716428,0.00%,305811,278215,102939,100001,103618,135602.0,278055.0,367426,420964,452744,456255
SK_ID_BUREAU,int64,1716428,0.00%,1716428,5.92443e+06,532266,5e+06,5.01831e+06,5184870.0,5926300.0,6.38568e+06,6.66063e+06,6.82513e+06,6.84346e+06
CREDIT_ACTIVE,object,1716428,0.00%,4,Closed:62.88%,Active:36.74%,Sold:0.38%,Bad debt:0.00%,,,Closed:62.88%,Active:36.74%,Sold:0.38%,Bad debt:0.00%
CREDIT_CURRENCY,object,1716428,0.00%,4,currency 1:99.92%,currency 2:0.07%,currency 3:0.01%,currency 4:0.00%,,,currency 1:99.92%,currency 2:0.07%,currency 3:0.01%,currency 4:0.00%
DAYS_CREDIT,int64,1716428,0.00%,2923,-1142.11,795.165,-2922,-2867,-2443.0,-987.0,-474,-215,-38,0
CREDIT_DAY_OVERDUE,int64,1716428,0.00%,942,0.818167,36.5444,0,0,0.0,0.0,0,0,0,2792
DAYS_CREDIT_ENDDATE,float64,1716428,6.15%,14096,510.517,4994.22,-42060,-2571,-1922.0,-330.0,474,1334,31029,31199
DAYS_ENDDATE_FACT,float64,1716428,36.92%,2917,-1017.44,714.011,-42023,-2639,-2159.0,-897.0,-425,-178,-24,0
AMT_CREDIT_MAX_OVERDUE,float64,1716428,65.51%,68251,3825.42,206032,0,0,0.0,0.0,0,7594.88,41988.7,1.15987e+08
CNT_CREDIT_PROLONG,int64,1716428,0.00%,10,0.00641041,0.0962239,0,0,0.0,0.0,0,0,0,9


In [10]:
# Per-column profile of the bureau_balance table; at most ten columns are shown.
bureau_balance_check = toad.detect(bureau_balance)
display(bureau_balance_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_BUREAU,int64,27299925,0.00%,817395,6.0363e+06,492349,5.00171e+06,5.012e+06,5.26243e+06,6.07082e+06,6.43195e+06,6.66495e+06,6.82817e+06,6.84289e+06
MONTHS_BALANCE,int64,27299925,0.00%,97,-30.7417,23.8645,-96,-91,-68,-25,-11,-4,0,0
STATUS,object,27299925,0.00%,8,C:49.99%,0:27.47%,X:21.28%,1:0.89%,5:0.23%,1:0.89%,5:0.23%,2:0.09%,3:0.03%,4:0.02%


## 历史信用数据

In [11]:
# Per-column profile of the credit_card_balance table; show the first ten
# columns only.
credit_card_balance_check = toad.detect(credit_card_balance)
display(credit_card_balance_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_PREV,int64,3840312,0.00%,104307,1904500.0,536469.0,1000020.0,1010020.0,1176860.0,1897120.0,2369330.0,2652010.0,2825320.0,2843500.0
SK_ID_CURR,int64,3840312,0.00%,103558,278324.0,102704.0,100006.0,103570.0,135912.0,278396.0,367580.0,421492.0,452749.0,456250.0
MONTHS_BALANCE,int64,3840312,0.00%,96,-34.5219,26.6678,-96.0,-93.0,-76.0,-28.0,-11.0,-5.0,-1.0,-1.0
AMT_BALANCE,float64,3840312,0.00%,1347904,58300.2,106307.0,-420250.0,0.0,0.0,0.0,89046.7,180016.0,466295.0,1505900.0
AMT_CREDIT_LIMIT_ACTUAL,int64,3840312,0.00%,181,153808.0,165146.0,0.0,0.0,0.0,112500.0,180000.0,427500.0,765000.0,1350000.0
AMT_DRAWINGS_ATM_CURRENT,float64,3840312,19.52%,2267,5961.32,28225.7,-6827.31,0.0,0.0,0.0,0.0,6750.0,135000.0,2115000.0
AMT_DRAWINGS_CURRENT,float64,3840312,0.00%,187005,7433.39,33846.1,-6211.62,0.0,0.0,0.0,0.0,11250.0,157500.0,2287100.0
AMT_DRAWINGS_OTHER_CURRENT,float64,3840312,19.52%,1832,288.17,8201.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1529850.0
AMT_DRAWINGS_POS_CURRENT,float64,3840312,19.52%,168748,2968.8,20796.9,0.0,0.0,0.0,0.0,0.0,0.0,77080.6,2239270.0
AMT_INST_MIN_REGULARITY,float64,3840312,7.95%,312266,3540.2,5600.15,0.0,0.0,0.0,0.0,6633.91,9000.0,23312.8,202882.0


In [12]:
# Per-column profile of the pos_cash_balance table; at most ten columns are shown.
pos_cash_balance_check = toad.detect(pos_cash_balance)
display(pos_cash_balance_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_PREV,int64,10001358,0.00%,936325,1.90322e+06,535847,1e+06,1.0119e+06,1.17690e+06,1.89656e+06,2.36896e+06,2.65303e+06,2.82422e+06,2.8435e+06
SK_ID_CURR,int64,10001358,0.00%,337252,278404,102764,100001,103540,135788,278654,367429,421391,452772,456255
MONTHS_BALANCE,int64,10001358,0.00%,96,-35.0126,26.0666,-96,-94,-77,-28,-13,-6,-2,-1
CNT_INSTALMENT,float64,10001358,0.26%,73,17.0897,11.9951,1,4,6,12,24,36,60,92
CNT_INSTALMENT_FUTURE,float64,10001358,0.26%,79,10.4838,11.1091,0,0,0,7,14,24,53,85
NAME_CONTRACT_STATUS,object,10001358,0.00%,9,Active:91.50%,Completed:7.45%,Signed:0.87%,Demand:0.07%,Returned to the store:0.05%,Returned to the store:0.05%,Approved:0.05%,Amortized debt:0.01%,Canceled:0.00%,XNA:0.00%
SK_DPD,int64,10001358,0.00%,3400,11.6069,132.714,0,0,0,0,0,0,235,4231
SK_DPD_DEF,int64,10001358,0.00%,2307,0.654468,32.7625,0,0,0,0,0,0,1,3595


In [13]:
# Per-column profile of the installment_payments table; at most ten columns
# are shown.
installment_payments_check = toad.detect(installment_payments)
display(installment_payments_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_PREV,int64,13605401,0.00%,997752,1903365.0,536202.905546,1000001.0,1011180.0,1176563.0,1896520.0,2369094.0,2652602.0,2824545.0,2843499.0
SK_ID_CURR,int64,13605401,0.00%,339587,278444.9,102718.310411,100001.0,103583.0,135870.0,278685.0,367530.0,421540.0,452802.0,456255.0
NUM_INSTALMENT_VERSION,float64,13605401,0.00%,65,0.8566373,1.035216,0.0,0.0,0.0,1.0,1.0,1.0,4.0,178.0
NUM_INSTALMENT_NUMBER,int64,13605401,0.00%,277,18.8709,26.664067,1.0,1.0,2.0,8.0,19.0,56.0,121.0,277.0
DAYS_INSTALMENT,float64,13605401,0.00%,2922,-1042.27,800.946284,-2922.0,-2832.0,-2307.0,-818.0,-361.0,-150.0,-21.0,-1.0
DAYS_ENTRY_PAYMENT,float64,13605401,0.02%,3039,-1051.114,800.585883,-4921.0,-2837.0,-2316.0,-827.0,-370.0,-159.0,-28.0,-1.0
AMT_INSTALMENT,float64,13605401,0.00%,902539,17050.91,50570.254429,0.0,13.275,717.705,8884.08,16710.21,31415.175,146068.7,3771487.845
AMT_PAYMENT,float64,13605401,0.02%,944235,17238.22,54735.783981,0.0,3.78,331.83,8125.515,16108.425,31179.915,178004.0,3771487.845


In [14]:
# Per-column profile of the previous_application table; show the first ten
# columns only.
previous_application_check = toad.detect(previous_application)
display(previous_application_check.iloc[:10])

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
SK_ID_PREV,int64,1670214,0.00%,1670214,1.92309e+06,532598,1e+06,1.01857e+06,1.18503e+06,1.92311e+06,2.38428e+06,2.66086e+06,2.82691e+06,2.84538e+06
SK_ID_CURR,int64,1670214,0.00%,338857,278357,102815,100001,103494,135738,278714,367514,421359,452785,456255
NAME_CONTRACT_TYPE,object,1670214,0.00%,4,Cash loans:44.76%,Consumer loans:43.66%,Revolving loans:11.57%,XNA:0.02%,,,Cash loans:44.76%,Consumer loans:43.66%,Revolving loans:11.57%,XNA:0.02%
AMT_ANNUITY,float64,1670214,22.29%,357959,15955.1,14782.1,0,2137.18,3817.53,11250,20658.4,34703.8,69685.8,418058
AMT_APPLICATION,float64,1670214,0.00%,93885,175234,292780,0,0,0,71046,180360,450000,1.35e+06,6.90516e+06
AMT_CREDIT,float64,1670214,0.00%,86803,196114,318575,0,0,0,80541,216418,533160,1.51538e+06,6.90516e+06
AMT_DOWN_PAYMENT,float64,1670214,53.64%,29278,6697.4,20921.5,-0.9,0,0,1638,7740,17109,65930.9,3.06004e+06
AMT_GOODS_PRICE,float64,1670214,23.08%,93885,227847,315397,0,15300,30555,112320,234000,585000,1.395e+06,6.90516e+06
WEEKDAY_APPR_PROCESS_START,object,1670214,0.00%,7,TUESDAY:15.27%,WEDNESDAY:15.27%,MONDAY:15.18%,FRIDAY:15.09%,THURSDAY:14.91%,MONDAY:15.18%,FRIDAY:15.09%,THURSDAY:14.91%,SATURDAY:14.41%,SUNDAY:9.86%
HOUR_APPR_PROCESS_START,int64,1670214,0.00%,24,12.4842,3.33403,0,5,8,12,15,17,20,23


## 特征筛选原则

- 高缺失率的特征要去掉
- 取值分布过于不均衡的特征要去掉
- 低 IV 的特征要去掉
  - 使用 toad 的 `quality` 并设置 `iv_only=True`，只计算 IV