In [8]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [9]:
# 设置字符集，防止图片中的中文乱码
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False

In [10]:
# 设置jupyter图片显示方式
%matplotlib tk

In [11]:
# 数据读取
df = pd.read_csv('./data/LoanStats3a.csv', skiprows=1, low_memory=False)
df.head(3)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,Cash,N,,,,,,
1,,,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,Cash,N,,,,,,
2,,,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,Cash,N,,,,,,


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42538 entries, 0 to 42537
Columns: 145 entries, id to settlement_term
dtypes: float64(115), object(30)
memory usage: 47.1+ MB


In [13]:
# 删除id和member_id
df.drop('id', 1, inplace=True)
df.drop('member_id', 1, inplace=True)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42538 entries, 0 to 42537
Columns: 143 entries, loan_amnt to settlement_term
dtypes: float64(114), object(29)
memory usage: 46.4+ MB


In [15]:
df.head(3)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,...,,,Cash,N,,,,,,
1,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,...,,,Cash,N,,,,,,
2,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,...,,,Cash,N,,,,,,


In [16]:
# 将列int_rate中的%去掉， 并转换为数字
df.int_rate = pd.Series(df.int_rate).str.replace('%', '').astype(float)
df.head(3)


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,,10+ years,...,,,Cash,N,,,,,,
1,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,Ryder,< 1 year,...,,,Cash,N,,,,,,
2,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,,10+ years,...,,,Cash,N,,,,,,


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42538 entries, 0 to 42537
Columns: 143 entries, loan_amnt to settlement_term
dtypes: float64(115), object(28)
memory usage: 46.4+ MB


In [18]:
# 删除df中行或者列中所有值为nan的
df.dropna(axis=0, how='all', inplace=True)
df.dropna(axis=1, how='all', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42535
Data columns (total 63 columns):
loan_amnt                     42535 non-null float64
funded_amnt                   42535 non-null float64
funded_amnt_inv               42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null float64
installment                   42535 non-null float64
grade                         42535 non-null object
sub_grade                     42535 non-null object
emp_title                     39909 non-null object
emp_length                    41423 non-null object
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
issue_d                       42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
desc                          29242 non-null object
p

In [19]:
# 查看一部分数据, 前5行的前7列的数据
df.ix[:5, :7]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade
0,5000.0,5000.0,4975.0,36 months,10.65,162.87,B
1,2500.0,2500.0,2500.0,60 months,15.27,59.83,C
2,2400.0,2400.0,2400.0,36 months,15.96,84.33,C
3,10000.0,10000.0,10000.0,36 months,13.49,339.31,C
4,3000.0,3000.0,3000.0,60 months,12.69,67.79,B
5,5000.0,5000.0,5000.0,36 months,7.9,156.46,A


In [20]:
df.ix[:5, 7:12]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,sub_grade,emp_title,emp_length,home_ownership,annual_inc
0,B2,,10+ years,RENT,24000.0
1,C4,Ryder,< 1 year,RENT,30000.0
2,C5,,10+ years,RENT,12252.0
3,C1,AIR RESOURCES BOARD,10+ years,RENT,49200.0
4,B5,University Medical Group,1 year,RENT,80000.0
5,A4,Veolia Transportaton,3 years,RENT,36000.0


In [21]:
# emp_title: employment title: 参考垃圾邮件过滤中介绍到的那个接收邮件的特征的处理方式，直接删除emp_title
print(df.emp_title.value_counts().head())
print(df.emp_title.unique().shape)
# tdf = df[['emp_title', 'loan_status']].groupby(['emp_title', 'loan_status'])['loan_status'].count()
# print(tdf)

US Army              139
Bank of America      115
IBM                   72
Kaiser Permanente     61
AT&T                  61
Name: emp_title, dtype: int64
(30659,)


In [22]:
# 删除emp_title
df.drop(['emp_title'], 1, inplace=True)

In [23]:
df.ix[:5, 7:12]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,sub_grade,emp_length,home_ownership,annual_inc,verification_status
0,B2,10+ years,RENT,24000.0,Verified
1,C4,< 1 year,RENT,30000.0,Source Verified
2,C5,10+ years,RENT,12252.0,Not Verified
3,C1,10+ years,RENT,49200.0,Source Verified
4,B5,1 year,RENT,80000.0,Source Verified
5,A4,3 years,RENT,36000.0,Source Verified


In [24]:
# emp_length
print(df.emp_length.value_counts().head(12))
print(df.emp_length.unique().shape)

10+ years    9369
< 1 year     5062
2 years      4743
3 years      4364
4 years      3649
1 year       3595
5 years      3458
6 years      2375
7 years      1875
8 years      1592
9 years      1341
Name: emp_length, dtype: int64
(12,)


In [25]:
df.replace('n/a', np.nan, inplace=True)
df.emp_length.fillna(value=0, inplace=True)
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'] = df['emp_length'].astype(int)
df.emp_length.value_counts()

10    9369
1     8657
2     4743
3     4364
4     3649
5     3458
6     2375
7     1875
8     1592
9     1341
0     1112
Name: emp_length, dtype: int64

In [26]:
df.ix[:5, 7:12]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,sub_grade,emp_length,home_ownership,annual_inc,verification_status
0,B2,10,RENT,24000.0,Verified
1,C4,1,RENT,30000.0,Source Verified
2,C5,10,RENT,12252.0,Not Verified
3,C1,10,RENT,49200.0,Source Verified
4,B5,1,RENT,80000.0,Source Verified
5,A4,3,RENT,36000.0,Source Verified


In [27]:
# 因为verification_status特征取值只有三种情况，那么不需要再处理该特征
df.verification_status.value_counts()

Not Verified       18758
Verified           13471
Source Verified    10306
Name: verification_status, dtype: int64

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42535
Data columns (total 62 columns):
loan_amnt                     42535 non-null float64
funded_amnt                   42535 non-null float64
funded_amnt_inv               42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null float64
installment                   42535 non-null float64
grade                         42535 non-null object
sub_grade                     42535 non-null object
emp_length                    42535 non-null int32
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
issue_d                       42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
desc                          29242 non-null object
purpose                       42535 non-null object
ti

In [29]:
df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
       'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
       'pub_rec_bankruptcies', 'tax_liens', 'hardship_flag',
       'disbursement_

In [30]:
# 计算object类型的所有列的对应数据的count值
for col in df.select_dtypes(include=['object']).columns:
    print('列{}具有{}个不同的value值'.format(col, len(df[col].unique())))

列term具有2个不同的value值
列grade具有7个不同的value值
列sub_grade具有35个不同的value值
列home_ownership具有5个不同的value值
列verification_status具有3个不同的value值
列issue_d具有55个不同的value值
列loan_status具有4个不同的value值
列pymnt_plan具有1个不同的value值
列desc具有28964个不同的value值
列purpose具有14个不同的value值
列title具有21265个不同的value值
列zip_code具有837个不同的value值
列addr_state具有50个不同的value值
列earliest_cr_line具有531个不同的value值
列revol_util具有1120个不同的value值
列initial_list_status具有1个不同的value值
列last_pymnt_d具有113个不同的value值
列next_pymnt_d具有99个不同的value值
列last_credit_pull_d具有125个不同的value值
列application_type具有1个不同的value值
列hardship_flag具有1个不同的value值
列disbursement_method具有1个不同的value值
列debt_settlement_flag具有2个不同的value值
列debt_settlement_flag_date具有2个不同的value值
列settlement_status具有4个不同的value值
列settlement_date具有58个不同的value值


In [31]:
df.revol_util.value_counts()
df.next_pymnt_d.value_counts()
df.purpose.value_counts()

debt_consolidation    19776
credit_card            5477
other                  4425
home_improvement       3199
major_purchase         2311
small_business         1992
car                    1615
wedding                1004
medical                 753
moving                  629
house                   426
educational             422
vacation                400
renewable_energy        106
Name: purpose, dtype: int64

In [32]:
# 删除一些出现类别数目比较多的object类型的column列数据
df.drop(['sub_grade','issue_d','desc','title',
         'zip_code','addr_state','earliest_cr_line',
         'last_pymnt_d','next_pymnt_d','last_credit_pull_d',
         'settlement_date'], 1,  inplace=True)

In [33]:
# 将百分号去掉
df.revol_util = pd.Series(df.revol_util).str.replace('%', '').astype(float)

In [34]:
# 计算object类型的所有列的对应数据的count值
for col in df.select_dtypes(include=['object']).columns:
    print('列{}具有{}个不同的value值'.format(col, len(df[col].unique())))

列term具有2个不同的value值
列grade具有7个不同的value值
列home_ownership具有5个不同的value值
列verification_status具有3个不同的value值
列loan_status具有4个不同的value值
列pymnt_plan具有1个不同的value值
列purpose具有14个不同的value值
列initial_list_status具有1个不同的value值
列application_type具有1个不同的value值
列hardship_flag具有1个不同的value值
列disbursement_method具有1个不同的value值
列debt_settlement_flag具有2个不同的value值
列debt_settlement_flag_date具有2个不同的value值
列settlement_status具有4个不同的value值


In [35]:
# 处理缺省值的情况（处理object数据类型的列）
df.select_dtypes(include=['O']).describe().T.\
   assign(missing_pct=df.apply(lambda x: (len(x) - x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
term,42535,2,36 months,31534,0.0
grade,42535,7,B,12389,0.0
home_ownership,42535,5,RENT,20181,0.0
verification_status,42535,3,Not Verified,18758,0.0
loan_status,42535,4,Fully Paid,34116,0.0
pymnt_plan,42535,1,n,42535,0.0
purpose,42535,14,debt_consolidation,19776,0.0
initial_list_status,42535,1,f,42535,0.0
application_type,42535,1,Individual,42535,0.0
hardship_flag,42535,1,N,42535,0.0


In [36]:
# missing_pct： 删除缺省比较严重的特征
df.drop(['debt_settlement_flag_date'], 1, inplace=True)
df.drop(['settlement_status'], 1, inplace=True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42535
Data columns (total 49 columns):
loan_amnt                     42535 non-null float64
funded_amnt                   42535 non-null float64
funded_amnt_inv               42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null float64
installment                   42535 non-null float64
grade                         42535 non-null object
emp_length                    42535 non-null int32
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
purpose                       42535 non-null object
dti                           42535 non-null float64
delinq_2yrs                   42506 non-null float64
inq_last_6mths                42506 non-null float64

In [38]:
# 贷后相关的字段删除
df.drop(['out_prncp','out_prncp_inv','total_pymnt',
         'total_pymnt_inv','total_rec_prncp', 'grade'] ,1, inplace=True)
df.drop(['total_rec_int','total_rec_late_fee',
         'recoveries','collection_recovery_fee',
         'collection_recovery_fee' ],1, inplace=True)
df.drop(['last_pymnt_amnt'],1, inplace=True)
df.drop(['policy_code'],1, inplace=True)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42535
Data columns (total 37 columns):
loan_amnt                     42535 non-null float64
funded_amnt                   42535 non-null float64
funded_amnt_inv               42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null float64
installment                   42535 non-null float64
emp_length                    42535 non-null int32
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
purpose                       42535 non-null object
dti                           42535 non-null float64
delinq_2yrs                   42506 non-null float64
inq_last_6mths                42506 non-null float64
mths_since_last_delinq        15609 non-null float6

In [40]:
# 处理缺省值的情况
df.select_dtypes(include=['float']).describe().T.\
   assign(missing_pct=df.apply(lambda x: (len(x) - x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
loan_amnt,42535.0,11089.722581,7410.938391,500.0,5200.0,9700.0,15000.0,35000.0,0.0
funded_amnt,42535.0,10821.585753,7146.914675,500.0,5000.0,9600.0,15000.0,35000.0,0.0
funded_amnt_inv,42535.0,10139.830603,7131.686447,0.0,4950.0,8500.0,14000.0,35000.0,0.0
int_rate,42535.0,12.165016,3.707936,5.42,9.63,11.99,14.72,24.59,0.0
installment,42535.0,322.623063,208.927216,15.67,165.52,277.69,428.18,1305.19,0.0
annual_inc,42531.0,69136.55642,64096.349719,1896.0,40000.0,59000.0,82500.0,6000000.0,9.4e-05
dti,42535.0,13.373043,6.726315,0.0,8.2,13.47,18.68,29.99,0.0
delinq_2yrs,42506.0,0.152449,0.512406,0.0,0.0,0.0,0.0,13.0,0.000682
inq_last_6mths,42506.0,1.081424,1.527455,0.0,0.0,1.0,2.0,33.0,0.000682
mths_since_last_delinq,15609.0,35.017618,22.418427,0.0,17.0,33.0,51.0,120.0,0.633032


In [41]:
# 删除缺失率比较高的特征
df.drop(['settlement_amount', 'settlement_percentage', 'settlement_term', 'mths_since_last_record'], 1, inplace=True)

In [42]:
# 处理缺省值的情况
df.select_dtypes(include=['int']).describe().T.\
   assign(missing_pct=df.apply(lambda x: (len(x) - x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
emp_length,42535.0,4.913389,3.461593,0.0,2.0,4.0,9.0,10.0,0.0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42535
Data columns (total 33 columns):
loan_amnt                     42535 non-null float64
funded_amnt                   42535 non-null float64
funded_amnt_inv               42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null float64
installment                   42535 non-null float64
emp_length                    42535 non-null int32
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
purpose                       42535 non-null object
dti                           42535 non-null float64
delinq_2yrs                   42506 non-null float64
inq_last_6mths                42506 non-null float64
mths_since_last_delinq        15609 non-null float6

In [44]:
# 查看最终目标属性（需要预测的指标）
df['loan_status'].value_counts()

Fully Paid                                             34116
Charged Off                                             5670
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Name: loan_status, dtype: int64

In [45]:
# 替换目标属性的值，1表示好用户，0表示坏用户
df.loan_status.replace('Fully Paid', int(1), inplace=True)
df.loan_status.replace('Charged Off', int(0), inplace=True)
df.loan_status.replace('Does not meet the credit policy. Status:Fully Paid', np.nan, inplace=True)
df.loan_status.replace('Does not meet the credit policy. Status:Charged Off', np.nan, inplace=True)
df['loan_status'].value_counts()

1.0    34116
0.0     5670
Name: loan_status, dtype: int64

In [46]:
# 删除不要的样本
df.dropna(subset=['loan_status'], inplace=True)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39786 entries, 0 to 39785
Data columns (total 33 columns):
loan_amnt                     39786 non-null float64
funded_amnt                   39786 non-null float64
funded_amnt_inv               39786 non-null float64
term                          39786 non-null object
int_rate                      39786 non-null float64
installment                   39786 non-null float64
emp_length                    39786 non-null int32
home_ownership                39786 non-null object
annual_inc                    39786 non-null float64
verification_status           39786 non-null object
loan_status                   39786 non-null float64
pymnt_plan                    39786 non-null object
purpose                       39786 non-null object
dti                           39786 non-null float64
delinq_2yrs                   39786 non-null float64
inq_last_6mths                39786 non-null float64
mths_since_last_delinq        14059 non-null float

In [48]:
# 计算关联信息
cor = df.corr()
cor.loc[:,:] = np.tril(cor, k=-1)
cor = cor.stack()
cor[(cor>0.7)|(cor<-0.7)]

funded_amnt           loan_amnt          0.981544
funded_amnt_inv       loan_amnt          0.940157
                      funded_amnt        0.958564
installment           loan_amnt          0.930209
                      funded_amnt        0.956108
                      funded_amnt_inv    0.905098
pub_rec_bankruptcies  pub_rec            0.845864
dtype: float64

In [49]:
# 删除相关性比较强的特征属性： 因为如果存在多个特征之间是强相关的，那么其实可以用其中任意一个特征即即可得到特征属性和y值之间的映射关系
df.drop(['funded_amnt', 'funded_amnt_inv', 'installment'], 1, inplace=True)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39786 entries, 0 to 39785
Data columns (total 30 columns):
loan_amnt                     39786 non-null float64
term                          39786 non-null object
int_rate                      39786 non-null float64
emp_length                    39786 non-null int32
home_ownership                39786 non-null object
annual_inc                    39786 non-null float64
verification_status           39786 non-null object
loan_status                   39786 non-null float64
pymnt_plan                    39786 non-null object
purpose                       39786 non-null object
dti                           39786 non-null float64
delinq_2yrs                   39786 non-null float64
inq_last_6mths                39786 non-null float64
mths_since_last_delinq        14059 non-null float64
open_acc                      39786 non-null float64
pub_rec                       39786 non-null float64
revol_bal                     39786 non-null float

In [51]:
# 查看一下各个特征属性取值为空的样本数目
df.isnull().sum()

loan_amnt                         0
term                              0
int_rate                          0
emp_length                        0
home_ownership                    0
annual_inc                        0
verification_status               0
loan_status                       0
pymnt_plan                        0
purpose                           0
dti                               0
delinq_2yrs                       0
inq_last_6mths                    0
mths_since_last_delinq        25727
open_acc                          0
pub_rec                           0
revol_bal                         0
revol_util                       50
total_acc                         0
initial_list_status               0
collections_12_mths_ex_med       56
application_type                  0
acc_now_delinq                    0
chargeoff_within_12_mths         56
delinq_amnt                       0
pub_rec_bankruptcies            697
tax_liens                        39
hardship_flag               

In [52]:
# 1. 数值为空的进行填充，填充要不填充默认值，0/1; 要不中值，均值
df.fillna(0.0, inplace=True)
df.fillna(0, inplace=True)

In [53]:
# 计算object类型的所有列的对应数据的count值
for col in df.select_dtypes(include=['object']).columns:
    print('列{}具有{}个不同的value值, 取值分别为:{}'.format(col, len(df[col].unique()), df[col].unique()))

列term具有2个不同的value值, 取值分别为:[' 36 months' ' 60 months']
列home_ownership具有5个不同的value值, 取值分别为:['RENT' 'OWN' 'MORTGAGE' 'OTHER' 'NONE']
列verification_status具有3个不同的value值, 取值分别为:['Verified' 'Source Verified' 'Not Verified']
列pymnt_plan具有1个不同的value值, 取值分别为:['n']
列purpose具有14个不同的value值, 取值分别为:['credit_card' 'car' 'small_business' 'other' 'wedding'
 'debt_consolidation' 'home_improvement' 'major_purchase' 'medical'
 'moving' 'vacation' 'house' 'renewable_energy' 'educational']
列initial_list_status具有1个不同的value值, 取值分别为:['f']
列application_type具有1个不同的value值, 取值分别为:['Individual']
列hardship_flag具有1个不同的value值, 取值分别为:['N']
列disbursement_method具有1个不同的value值, 取值分别为:['Cash']
列debt_settlement_flag具有2个不同的value值, 取值分别为:['N' 'Y']


In [54]:
# 哑扁码操作
df = pd.get_dummies(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39786 entries, 0 to 39785
Data columns (total 51 columns):
loan_amnt                              39786 non-null float64
int_rate                               39786 non-null float64
emp_length                             39786 non-null int32
annual_inc                             39786 non-null float64
loan_status                            39786 non-null float64
dti                                    39786 non-null float64
delinq_2yrs                            39786 non-null float64
inq_last_6mths                         39786 non-null float64
mths_since_last_delinq                 39786 non-null float64
open_acc                               39786 non-null float64
pub_rec                                39786 non-null float64
revol_bal                              39786 non-null float64
revol_util                             39786 non-null float64
total_acc                              39786 non-null float64
collections_12_mths_ex_me

In [55]:
df.head(1).values

array([[5.0000e+03, 1.0650e+01, 1.0000e+01, 2.4000e+04, 1.0000e+00,
        2.7650e+01, 0.0000e+00, 1.0000e+00, 0.0000e+00, 3.0000e+00,
        0.0000e+00, 1.3648e+04, 8.3700e+01, 9.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
        1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        0.0000e+00]])

In [57]:
# 模型的输出
df.to_csv('./data/features01.csv', header=True, index=False)

## 特征工程总结
1. 主要目的：为了决策模型选择特征的一个过程，主要处理的数据就是原始的一些相关数据，主要数据格式：字符串、数值类型为主；首先处理做的特征选择，主要是对字符串类型的数字做操作；当所有的字符串类型数字处理完成后，再考虑数值类型的数字，在处理数值类型数字的时候，常用方式：连续数据区间化、标准化、归一化、正则化。
2. 如果原始数据是字符串类型的，那么该属性就有可能是两种情况：第一个：是一个类别信息，那需要做哑编码；第二个：字符串是一个文本信息，那就需要做词袋法、TFIDF或者Word2Vec转换。
3. 至于维度扩展、特征选择、降维主要按照以下方式来操作：
    1. 维度扩展：如果当前模型在第一步和第二步后的数据上训练效果不佳，那么可能是我们提取的特征有问题，那么可以考虑通过某种维度扩展方式提取更有效的特征数据，来训练模型。
    2. 特征选择、降维：如果当前模型在第一步和第二步后的数据集上训练效果还可以，但是测试效果不稳定或者训练速度比较慢，那这个有可能是因为存在异常的特征属性导致效果不稳定的或者存在太多的属性导致训练速度慢的，那这个时候就可以考虑做一个特征选择或者降维的操作 ------> 直白来讲，如果你的维度是上万维的，那么你可以考虑做一个特征选择，将维度变成千级别。如果维度是千级别的，那可以考虑做一个降维的操作，将维度变成百左右 ----> 实际工作中，一般最终输入到模型中的特征属性维度: 30~500左右。

### 特征工程的特征构建
1. 在特征工程过程中，其实是根据业务来构建一些特征信息，构建的这些特征信息，是你(编程人员)认为的可能会影响目标属性的特征