# FeatureAnalysis模块使用指南

### 一，模块整体架构

![FeatureAnalysis特征分析模块架构图](FeatureAnalysis特征分析模块.png)

#### outliers_analysis的输出：

In [3]:
# Reference list of the outlier-analysis output columns.
# The cell is a bare tuple of names; the trailing ';' suppresses its output.
'med', # median
'seg_25', # first quartile (25th percentile)
'seg_75', # third quartile (75th percentile)
'up_limit',  # upper bound used to flag outliers
'low_limit', # lower bound used to flag outliers
'up_ratio',  # share of values above the upper bound
'low_ratio';  # share of values below the lower bound


#### basic_analysis的输出：

In [2]:
#------ coverage ------------------------#
'not_nan_ratio',  # non-null ratio; "coverage" usually refers to this value
'not_zero_ratio', # non-zero ratio; null values are not counted as non-zero
'not_outlier_ratio', # non-outlier ratio; null values are not counted as non-outliers

#------ summary statistics ------------------------#
'class_num', # number of distinct categories in the data
'value_num', # number of non-null values
'min', # minimum
'mean',# mean
'med', # median
'most', # mode
'max', # maximum

#------ effectiveness ----------------------#
'ks(continous feature)', # KS statistic, suited to continuous features
'ks_pvalue', # p-value of the KS statistic
'chi2(discrete feature)', # chi-square statistic, suited to discrete features
'chi2_pvalue', # p-value of the chi-square statistic
't(for mean)', # t-test on the mean; applies to continuous features only
't_pvalue' ,# p-value of the t-test on the mean
'z(for coverage)',# z-test on coverage, for continuous and discrete features; coverage means not_nan_ratio
'z_pvalue'; # p-value of the z-test on coverage


#### psi_analysis的输出：

In [4]:
'psi', # PSI metric; computed only when the number of valid values in train_data and test_data is > 10, NaN otherwise
'is_stable', # stability flag; psi < 0.2 is judged stable
'train_class_num', # number of distinct categories in train_data
'test_class_num' , # number of distinct categories in test_data
'train_value_num', # number of valid values in train_data
'test_value_num';# number of valid values in test_data


#### ks_analysis的输出：

In [5]:
'feature_interval',# value interval of the feature
'order_num', # number of orders
'overdue_num', # number of overdue orders
'overdue_ratio', # share of overdue orders
'normal_num', # number of normal orders
'normal_ratio', # share of normal orders
'overdue_cum_ratio', # cumulative share of overdue orders
'normal_cum_ratio', # cumulative share of normal orders
'ks_value'; # KS statistic


#### chi2_analysis的输出：

In [6]:
'TP', # number of overdue samples with feature == 1
'FP', # number of normal samples with feature == 1
'TN', # number of normal samples with feature == 0
'FN', # number of overdue samples with feature == 0
'TPR', # TP/(TP+FN): share of overdue samples whose feature is 1
'FPR',# FP/(FP+TN): share of normal samples whose feature is 1
'overdue_ratio_0',# overdue rate among samples with feature == 0
'overdue_ratio_1',# overdue rate among samples with feature == 1
'precision',# precision
'accuracy',# accuracy
'chi2', # chi-square statistic
'chi2_pvalue'; # p-value of the chi-square statistic


### 二，单特征分析示范

In [12]:
import numpy as np
import pandas as pd
from FeatureAnalysis import feature_analysis


In [13]:
# Toy single-feature sample: 20 values containing one missing entry (np.nan)
# and one extreme value (100), plus the matching binary labels.
data = [
    1.0, 2, 3, 4, 5, 6, 4, 3, 2, 1,
    2, 9, 10, 100, np.nan, 0, 7, 8, 10, 6,
]
label = [
    0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
]


In [15]:
# Sanity check: every feature value must have a matching label.
assert len(data)==len(label)


In [16]:
# Create the single-feature analyzer object used by the following cells.
fa = feature_analysis()


In [18]:
# Outlier analysis
# NOTE(review): the recorded output is consistent with
# limits = quartile -/+ alpha * (seg_75 - seg_25); confirm against the module source.
fa.outliers_analysis(data,alpha = 2)


Unnamed: 0,med,seg_25,seg_75,up_limit,low_limit,up_ratio,low_ratio
0,4.0,2.0,7.5,18.5,-9.0,0.052632,0.0


In [19]:
# Drop outliers (in the recorded output the value 100 is replaced by nan)
# NOTE(review): `data` is passed for both positional arguments; presumably the
# first is the reference sample and the second the one to clean — confirm.
fa.drop_outliers(data,data,alpha = 2)


[1.0, 2, 3, 4, 5, 6, 4, 3, 2, 1, 2, 9, 10, nan, nan, 0, 7, 8, 10, 6]

In [21]:
# Basic analysis; .T transposes the result so each metric is shown on its own row
fa.basic_analysis(data,label).T


Unnamed: 0,0
not_nan_ratio,0.95
not_zero_ratio,0.9
not_outlier_ratio,0.9
class_num,12.0
value_num,19.0
min,0.0
mean,9.631579
med,4.0
most,2.0
max,100.0


In [22]:
# PSI stability analysis: compare `data` against a second sample, `test_data`
test_data = [10,9,5,3,4,3,2,1,6,7,5,np.nan,10,100]
fa.psi_analysis(data,test_data)


Unnamed: 0,psi,is_stable,train_class_num,test_class_num,train_value_num,test_value_num
0,1.215559,0,12,10,19,13


In [31]:
# KS effectiveness analysis: mainly for continuous features, but discrete
# features can be analysed as well.
dfks = fa.ks_analysis(data,label)
# Display the leading columns of the wide result table.
# Floor division (//) keeps the split point an int on both Python 2 and 3;
# with `/` Python 3 produces a float, which iloc rejects as a slice bound.
# NOTE(review): the split point is derived from the row count, len(dfks) —
# confirm this is intended rather than the column count.
dfks.iloc[:,0:len(dfks)//2]


Unnamed: 0,feature_interval,order_num,overdue_num,overdue_ratio
0,"[0.0,1.0)",1.0,0.0,0.0
1,"[1.0,2.0)",2.0,1.0,0.5
2,"[2.0,3.0)",3.0,1.0,0.333333
3,"[3.0,4.0)",2.0,1.0,0.5
4,"[4.0,6.0)",3.0,0.0,0.0
5,"[6.0,7.0)",2.0,1.0,0.5
6,"[7.0,9.0)",2.0,0.0,0.0
7,"[9.0,10.0)",1.0,1.0,1.0
8,"[10.0,100.0]",3.0,2.0,0.666667


In [27]:
# Display the trailing columns of the KS result table.
# Floor division (//) keeps the split point an int on both Python 2 and 3;
# with `/` Python 3 produces a float, which iloc rejects as a slice bound.
dfks.iloc[:,len(dfks)//2:]


Unnamed: 0,normal_num,normal_ratio,overdue_cum_ratio,normal_cum_ratio,ks_value
0,1.0,1.0,0.0,0.083333,0.083333
1,1.0,0.5,0.142857,0.166667,0.02381
2,2.0,0.666667,0.285714,0.333333,0.047619
3,1.0,0.5,0.428571,0.416667,0.011905
4,3.0,1.0,0.428571,0.666667,0.238095
5,1.0,0.5,0.571429,0.75,0.178571
6,2.0,1.0,0.571429,0.916667,0.345238
7,0.0,0.0,0.714286,0.916667,0.202381
8,1.0,0.333333,1.0,1.0,0.0


In [30]:
# Chi-square, recall and related metrics; mainly intended for discrete features.
# .T transposes the result so each metric is shown on its own row.
fa.chi2_analysis(data,label).T


Unnamed: 0,0
TP,7.0
FP,11.0
TN,1.0
FN,0.0
TPR,1.0
FPR,0.9166667
overdue_ratio_0,0.0
overdue_ratio_1,0.3888889
precision,0.3888889
accuracy,0.4210526


### 三，多特征分析示范

In [3]:
import numpy as np
import pandas as pd
from FeatureAnalysis import FeatureAnalysis


In [4]:
# Build dftrain, the training-set feature table: 12 rows with an id column
# (phone), a loan date, a binary label and two feature columns.
dftrain = pd.DataFrame(
    {
        'phone': ['x%d' % i for i in range(1, 13)],
        'loan_dt': ['2018-01-01'] * 12,
        'label': [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0],
        'feature1': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
        'feature2': [float(v) for v in range(1, 13)],
    },
    # Explicit column order, independent of dict ordering.
    columns=['phone', 'loan_dt', 'label', 'feature1', 'feature2'],
)
dftrain


Unnamed: 0,phone,loan_dt,label,feature1,feature2
0,x1,2018-01-01,0,1,1.0
1,x2,2018-01-01,1,0,2.0
2,x3,2018-01-01,1,1,3.0
3,x4,2018-01-01,0,0,4.0
4,x5,2018-01-01,1,1,5.0
5,x6,2018-01-01,0,0,6.0
6,x7,2018-01-01,0,1,7.0
7,x8,2018-01-01,0,0,8.0
8,x9,2018-01-01,0,1,9.0
9,x10,2018-01-01,0,0,10.0


In [5]:
# Build dftest, the test-set feature table: 10 rows with the same columns
# as the training table.
dftest = pd.DataFrame(
    {
        'phone': ['y%d' % i for i in range(1, 11)],
        'loan_dt': ['2018-02-01'] * 10,
        'label': [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        'feature1': [1, 0, 0, 1, 0, 0, 1, 0, 1, 0],
        'feature2': [float(v) for v in range(10, 0, -1)],
    },
    # Explicit column order, independent of dict ordering.
    columns=['phone', 'loan_dt', 'label', 'feature1', 'feature2'],
)
dftest


Unnamed: 0,phone,loan_dt,label,feature1,feature2
0,y1,2018-02-01,1,1,10.0
1,y2,2018-02-01,0,0,9.0
2,y3,2018-02-01,0,0,8.0
3,y4,2018-02-01,1,1,7.0
4,y5,2018-02-01,0,0,6.0
5,y6,2018-02-01,0,0,5.0
6,y7,2018-02-01,0,1,4.0
7,y8,2018-02-01,1,0,3.0
8,y9,2018-02-01,0,1,2.0
9,y10,2018-02-01,0,0,1.0


In [6]:
# Create the multi-feature analyzer from the training and test tables.
FA = FeatureAnalysis(dftrain,dftest)


In [7]:
# Basic feature analysis; .T transposes the result so each metric is shown on its own row
FA.BasicAnalysis().T


Unnamed: 0,0,1
feature_name,feature1,feature2
not_nan_ratio,1,1
not_zero_ratio,0.5,1
not_outlier_ratio,1,1
class_num,2,12
value_num,22,22
min,0,1
mean,0.5,6.04545
med,0.5,6
most,0,1


In [8]:
# Feature stability (PSI) analysis; .T transposes the result so each metric is shown on its own row
FA.PsiAnalysis().T


Unnamed: 0,0,1
feature_name,feature1,feature2
psi,0.136022,3.56636
is_stable,1,0
train_class_num,2,12
test_class_num,2,10
train_value_num,12,12
test_value_num,10,10


In [10]:
# Feature KS analysis
dfks = FA.KsAnalysis()
# Display the leading columns of the wide result table.
# Floor division (//) keeps the split point an int on both Python 2 and 3;
# with `/` Python 3 produces a float, which iloc rejects as a slice bound.
# NOTE(review): the split point is derived from the row count, len(dfks) —
# confirm this is intended rather than the column count.
dfks.iloc[:,0:len(dfks)//2]


Unnamed: 0,Unnamed: 1,feature_interval,order_num,overdue_num,overdue_ratio,normal_num,normal_ratio
feature1,0,"[0,0.5)",11.0,2.0,0.181818,9.0,0.818182
feature1,1,"[0.5,1]",11.0,5.0,0.454545,6.0,0.545455
feature2,0,"[1.0,2.0)",2.0,0.0,0.0,2.0,1.0
feature2,1,"[2.0,3.0)",2.0,1.0,0.5,1.0,0.5
feature2,2,"[3.0,4.0)",2.0,2.0,1.0,0.0,0.0
feature2,3,"[4.0,5.0)",2.0,0.0,0.0,2.0,1.0
feature2,4,"[5.0,6.0)",2.0,1.0,0.5,1.0,0.5
feature2,5,"[6.0,7.0)",2.0,0.0,0.0,2.0,1.0
feature2,6,"[7.0,8.0)",2.0,1.0,0.5,1.0,0.5
feature2,7,"[8.0,9.0)",2.0,0.0,0.0,2.0,1.0


In [11]:
# Display the trailing columns of the KS result table.
# Floor division (//) keeps the split point an int on both Python 2 and 3;
# with `/` Python 3 produces a float, which iloc rejects as a slice bound.
dfks.iloc[:,len(dfks)//2:]


Unnamed: 0,Unnamed: 1,overdue_cum_ratio,normal_cum_ratio,ks_value
feature1,0,0.285714,0.6,0.314286
feature1,1,1.0,1.0,0.0
feature2,0,0.0,0.133333,0.133333
feature2,1,0.142857,0.2,0.057143
feature2,2,0.428571,0.2,0.228571
feature2,3,0.428571,0.333333,0.095238
feature2,4,0.571429,0.4,0.171429
feature2,5,0.571429,0.533333,0.038095
feature2,6,0.714286,0.6,0.114286
feature2,7,0.714286,0.733333,0.019048


In [18]:
# Feature chi-square analysis; .T transposes the result so each metric is shown on its own row
FA.Chi2Analysis().T


Unnamed: 0,feature1
TP,5.0
FP,6.0
TN,9.0
FN,2.0
TPR,0.714286
FPR,0.4
overdue_ratio_0,0.181818
overdue_ratio_1,0.454545
precision,0.454545
accuracy,0.636364
