In [1]:
import numpy as np     # 用来做数学运算
import pandas as pd    # 用来处理数据表

# 将所有图都在 Notebook 里显示
%matplotlib inline               
import matplotlib.pyplot as plt  # 用来画图
from sklearn.model_selection import train_test_split    # 做交叉验证，划分训练集和测试集
from sklearn.tree import DecisionTreeClassifier         # 决策树分类器
from sklearn.ensemble import GradientBoostingClassifier # 提升树分类器
from sklearn.ensemble import RandomForestClassifier     # 随机森林分类器

  from numpy.core.umath_tests import inner1d


In [2]:
loans = pd.read_csv('lending-club-data.csv', low_memory=False)
loans.head(3).append(loans.tail(3))

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
122604,9695736,11547808,8525,8525,8525,60 months,18.25,217.65,D,D3,...,0.6,0.0,1.0,1.0,0,6.95812,20190101T000000,0,1,0
122605,9684700,11536848,22000,22000,22000,60 months,19.97,582.5,D,D5,...,1.0,1.0,0.0,1.0,0,8.96154,20190101T000000,1,0,1
122606,9604874,11457002,2000,2000,2000,36 months,7.9,62.59,A,A4,...,0.8,0.0,1.0,1.0,0,0.904916,20170101T000000,0,1,1


In [3]:
loans.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc',
       'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
       'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
       'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans',
       'bad_loans', 'emp_length_num', 'grade_num', '

In [4]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.drop('bad_loans', 1)

In [5]:
target = 'safe_loans'
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquincies 
            'delinq_2yrs_zero',          # no delinquincies in last 2 years
            'inq_last_6mths',            # number of creditor inquiries in last 6 months
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]

In [6]:
num_of_all_loans = len(loans)
loans = loans[[target] + features].dropna()
num_of_na = num_of_all_loans - len(loans)

print( 'Dropping %s observations; keeping %s ' % (num_of_na, len(loans)) )

Dropping 29 observations; keeping 122578 


In [7]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are fewer risky loans than safe loans, find the ratio of 
# the sizes and use that percentage to undersample the safe loans.
ratio = len(risky_loans_raw)/float(len(safe_loans_raw))

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(frac=ratio, random_state=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)

In [8]:
N1 = len(safe_loans)
N2 = len(risky_loans)
N = N1 + N2
print( "%% of safe loans  : %.2f%%" %(N1/N*100.0) )
print( "%% of risky loans : %.2f%%" %(N2/N*100.0) )
print( "Total number of loans in our new dataset :", N )

% of safe loans  : 50.00%
% of risky loans : 50.00%
Total number of loans in our new dataset : 46294


In [9]:
loans = loans_data # keep the original form

In [10]:
categorical_variables = []
for feat_name, feat_type in zip(loans_data.columns.values,loans_data.dtypes):
    if feat_type == object:
        categorical_variables.append(feat_name)

categorical_variables

['grade', 'home_ownership', 'purpose']

In [11]:
for feature in categorical_variables:
    feat_value = loans_data[feature].unique()
    loans_data_one_hot_encoded = pd.DataFrame()
    for val in feat_value:
        label = feature + '.' + val
        loans_data_one_hot_encoded[label] = loans_data[feature].apply(lambda x: 1 if x == val else 0)
    loans_data = pd.concat([loans_data, loans_data_one_hot_encoded], axis=1)
loans_data = loans_data.drop(categorical_variables,axis=1)

loans_data.head(3).append(loans_data.tail(3))

Unnamed: 0,safe_loans,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,...,purpose.other,purpose.debt_consolidation,purpose.major_purchase,purpose.credit_card,purpose.home_improvement,purpose.moving,purpose.house,purpose.medical,purpose.wedding,purpose.vacation
1,-1,0.8,1,1,1.0,2.3932,0.0,1.0,5.0,1,...,0,0,0,0,0,0,0,0,0,0
6,-1,0.4,0,5,5.55,4.5717,0.0,1.0,2.0,1,...,0,0,0,0,0,0,0,0,0,0
7,-1,1.0,1,1,18.08,9.716,0.0,1.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
56203,1,0.2,0,3,15.77,5.557,1.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
17572,1,0.2,0,5,13.89,7.18783,0.0,1.0,0.0,0,...,0,1,0,0,0,0,0,0,0,0
98495,1,0.6,0,9,8.14,8.60357,0.0,1.0,0.0,1,...,0,0,0,1,0,0,0,0,0,0


In [12]:
features = loans_data.columns.values
features = features[features != target]
features

array(['sub_grade_num', 'short_emp', 'emp_length_num', 'dti',
       'payment_inc_ratio', 'delinq_2yrs', 'delinq_2yrs_zero',
       'inq_last_6mths', 'last_delinq_none', 'last_major_derog_none',
       'open_acc', 'pub_rec', 'pub_rec_zero', 'revol_util',
       'total_rec_late_fee', 'int_rate', 'total_rec_int', 'annual_inc',
       'funded_amnt', 'funded_amnt_inv', 'installment', 'grade.C',
       'grade.F', 'grade.B', 'grade.D', 'grade.A', 'grade.E', 'grade.G',
       'home_ownership.RENT', 'home_ownership.OWN',
       'home_ownership.MORTGAGE', 'home_ownership.OTHER', 'purpose.car',
       'purpose.small_business', 'purpose.other',
       'purpose.debt_consolidation', 'purpose.major_purchase',
       'purpose.credit_card', 'purpose.home_improvement',
       'purpose.moving', 'purpose.house', 'purpose.medical',
       'purpose.wedding', 'purpose.vacation'], dtype=object)

In [13]:
print( "# of features (after one-hot encoding) = %s" % len(features) )

# of features (after one-hot encoding) = 44


In [14]:
(train_data, validation_data) = train_test_split( loans_data, 
                             train_size=0.8, random_state=1 )
print( train_data.shape, validation_data.shape )
X = train_data[features]
Y = train_data[target]

(37035, 45) (9259, 45)




In [15]:
GBT_5 = GradientBoostingClassifier(n_estimators=5, max_depth=6)
GBT_5 = GBT_5.fit(X, Y)
print( GBT_5 )

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5, presort='auto',
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


In [16]:
RF_5 = RandomForestClassifier(n_estimators=5, max_depth=6)
RF_5 = RF_5.fit(X, Y)
print( RF_5 )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [17]:
# Select all positive and negative examples.
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Select 2 examples from the validation set for positive & negative loans
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

# Append the 4 examples into a single dataset
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,safe_loans,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,...,purpose.other,purpose.debt_consolidation,purpose.major_purchase,purpose.credit_card,purpose.home_improvement,purpose.moving,purpose.house,purpose.medical,purpose.wedding,purpose.vacation
93539,1,0.6,0,5,31.7,13.309,0.0,1.0,0.0,0,...,0,1,0,0,0,0,0,0,0,0
10285,1,0.6,0,4,10.8,8.19823,0.0,1.0,0.0,1,...,0,1,0,0,0,0,0,0,0,0
121325,-1,0.4,0,6,23.57,6.50943,0.0,1.0,1.0,1,...,0,1,0,0,0,0,0,0,0,0
74825,-1,1.0,0,7,18.56,19.0867,0.0,1.0,0.0,1,...,0,1,0,0,0,0,0,0,0,0


In [18]:
DT = DecisionTreeClassifier(max_depth=6)
DT = DT.fit(X, Y)
print(DT)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [20]:
print(DT.score(X, Y))
print(GBT_5.score(X, Y))
print(RF_5.score(X, Y))

0.6548130147158093
0.665883623599298
0.6525448899689483
