In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
# Widen pandas' display limits so the wide loan table can be inspected in full.
# Full option names are used: abbreviated keys only resolve through pattern
# matching and stop working if another option with a similar name is added.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_colwidth", 200)

In [3]:
# Read the loan table from its CSV (path is relative to this notebook) and
# preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../../Data/20x45B_All.csv')
data.head(n=5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,acc_now_delinq,collections_12_mths_ex_med,pub_rec,months_cr_line,term,initial_list_status,grade,sub_grade,emp_length,ANY,MORTGAGE,NONE,OTHER,OWN,RENT,car,credit_card,debt_consolidation,educational,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,-0.805788,0.544423,0.5703,0.663197,0.663194,-0.388527,-0.928141,0.020404,0.814165,0.277188,1.683433,0.681092,-0.074467,-0.125292,-0.377704,0.341665,0.729709,1.019288,0.837092,0.319365,-0.684242,-0.017646,1.099265,-0.007523,-0.007523,-0.358603,-0.873547,-0.084014,-0.342123,-1.311086,-0.003071,-0.252444,-0.117473,6.888409,-0.123241,-0.114845,-0.323001,-0.037391,-0.171212,-0.082752,-0.052011,-0.372811,1.298703,-1.011585,1.0
1,0.650069,0.015948,0.055254,-1.040497,-1.040493,0.282002,0.487904,1.071777,-0.307337,0.416038,-0.902143,-0.383738,-0.074467,-0.125292,-0.377704,-0.928019,0.729709,1.019288,0.837092,0.871554,-0.427946,-0.017646,1.099265,-0.007523,-0.007523,-0.358603,-0.873547,-0.084014,-0.342123,0.762727,-0.003071,-0.252444,-0.117473,-0.145171,-0.123241,-0.114845,-0.323001,-0.037391,-0.171212,-0.082752,-0.052011,2.682328,-0.769999,-1.011585,0.0
2,-0.232905,-0.522707,1.648144,3.583815,3.583802,-0.133801,-0.383561,1.071777,-0.067015,-0.358038,-0.040284,-0.383738,-0.074467,-0.125292,-0.377704,-0.892085,0.729709,-0.981077,0.837092,0.595459,1.109835,-0.017646,-0.909699,-0.007523,-0.007523,2.788603,-0.873547,-0.084014,-0.342123,0.762727,-0.003071,-0.252444,-0.117473,-0.145171,-0.123241,-0.114845,-0.323001,-0.037391,-0.171212,-0.082752,-0.052011,2.682328,-0.769999,-1.011585,1.0
3,0.529186,0.644379,0.008638,-1.040497,-1.040493,-1.118991,0.203558,-0.154825,-0.067015,0.212601,-0.902143,-0.383738,-0.074467,-0.125292,-0.377704,2.042563,0.729709,1.019288,-0.426533,-0.232824,-0.940539,-0.017646,-0.909699,-0.007523,-0.007523,2.788603,-0.873547,11.902814,-0.342123,-1.311086,-0.003071,-0.252444,-0.117473,-0.145171,-0.123241,-0.114845,-0.323001,-0.037391,-0.171212,-0.082752,-0.052011,2.682328,-0.769999,-1.011585,0.0
4,1.780067,0.823931,1.743649,0.906582,0.906578,-1.118991,1.300984,2.298379,2.336203,-0.106754,-0.902143,0.681092,-0.074467,-0.125292,-0.377704,0.868704,0.729709,1.019288,-0.426533,-0.232824,0.597242,-0.017646,1.099265,-0.007523,-0.007523,-0.358603,-0.873547,-0.084014,-0.342123,0.762727,-0.003071,-0.252444,-0.117473,-0.145171,-0.123241,-0.114845,-0.323001,-0.037391,-0.171212,-0.082752,-0.052011,-0.372811,1.298703,-1.011585,1.0


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(106009, 45)

In [5]:
# Class balance of the binary target: number of rows per label.
data['loan_status_bin'].value_counts()

1.0    62957
0.0    43052
Name: loan_status_bin, dtype: int64

In [6]:
# Null model (random investment): the share of rows labelled 1 is the chance of
# picking a good loan at random, about 59.4%.
data['loan_status_bin'].eq(1).mean()

0.5938835381901537

In [7]:
# Feature matrix: every column except the target.
# 'Unnamed: 0' is also removed: in the preview it simply counts 0, 1, 2, ...,
# i.e. it looks like the row index written out with the CSV rather than a loan
# attribute, so it must not be offered to the feature selector.
# errors='ignore' keeps this cell working if the file is later read with
# index_col=0 and that column no longer exists.
# After this change the frame has 43 feature columns.
X = (
    data
    .drop(columns='loan_status_bin')
    .drop(columns='Unnamed: 0', errors='ignore')
)
X.shape

(106009, 44)

In [8]:
# Target vector: the binary loan-status column.
y = data['loan_status_bin']

In [9]:
# Sequential feature selection wrapped around a default logistic regression.
# `direction="forward"` is the library default; it is spelled out because the
# report printed further down describes the result as forward selection.
n_features = 10
lr = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=lr,
    n_features_to_select=n_features,
    direction="forward",
)

In [10]:
# Fit the selector on the full feature matrix and target.
sfs.fit(X,y)

In [11]:
#sfs.get_support()

In [12]:
# Report the column names kept by the fitted selector (boolean mask -> names).
print(
    f"Top {n_features} features selected by forward sequential selection:"
    f"{list(X.columns[sfs.get_support()])}"
)


Top 10 features selected by forward sequential selection:['annual_inc', 'dti', 'fico_range_low', 'installment', 'revol_bal', 'term', 'grade', 'sub_grade', 'RENT', 'credit_card']


In [43]:
# Names of the columns the fitted selector kept, as a plain list.
# NOTE(review): the saved output of this cell lists 3 features while the print
# cell above reports 10, and the execution counters are out of order. The saved
# results were presumably produced with a different selector setting; re-run
# from a fresh kernel before trusting the numbers that follow.
feat = X.columns[sfs.get_support()].tolist()
feat

['dti', 'grade', 'RENT']

In [29]:
# Restrict the frame to the selected feature columns.
X_feat = data[feat]

In [30]:
# Target column again (same assignment as the earlier `y = ...` cell), shown
# here for inspection.
y = data['loan_status_bin']
y

0         1.0
1         0.0
2         1.0
3         0.0
4         1.0
         ... 
106004    0.0
106005    1.0
106006    1.0
106007    1.0
106008    1.0
Name: loan_status_bin, Length: 106009, dtype: float64

In [31]:
# Baseline: a default logistic regression as the only pipeline step.
lr_check = LogisticRegression()
steps = [("model", lr_check)]
pipeline = Pipeline(steps)

In [32]:
# Accuracy of the logistic-regression pipeline on the selected features,
# estimated with one repetition of seeded, stratified 5-fold cross-validation.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# `np` is not imported by the notebook's import cell, so `np.mean(scores)`
# raises NameError on a fresh kernel. `scores` is already a NumPy array, so its
# own `.mean()` gives the same value with no extra import.
score = scores.mean()
print(score)

[0.59989624 0.59805679 0.60178285 0.60145269 0.6015754 ]
0.6005527929006451


In [33]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings as the only pipeline step.
# The estimator is seeded (same seed as the CV splitter) so that repeated runs
# of the notebook produce the same scores.
ada = AdaBoostClassifier(random_state=1)
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [34]:
# Accuracy of the AdaBoost pipeline on the selected features, estimated with
# one repetition of seeded, stratified 5-fold cross-validation.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# `np` is not imported by the notebook's import cell, so `np.mean(scores)`
# raises NameError on a fresh kernel. `scores` is already a NumPy array, so its
# own `.mean()` gives the same value with no extra import.
score = scores.mean()
print(score)

[0.60140553 0.59909443 0.6022545  0.60300915 0.60138673]
0.6014300668496089


In [35]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron (alpha=1, up to 1000 iterations) as the only pipeline
# step. Training starts from random weights, so the estimator is seeded (same
# seed as the CV splitter) to make the reported scores reproducible.
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=1)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [36]:
# Accuracy of the MLP pipeline on the selected features, estimated with one
# repetition of seeded, stratified 5-fold cross-validation.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# `np` is not imported by the notebook's import cell, so `np.mean(scores)`
# raises NameError on a fresh kernel. `scores` is already a NumPy array, so its
# own `.mean()` gives the same value with no extra import.
score = scores.mean()
print(score)

[0.60121687 0.59824545 0.60187718 0.60022639 0.60124522]
0.60056222285845


In [37]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings as the only pipeline step.
# The estimator is seeded (same seed as the CV splitter) so that repeated runs
# of the notebook produce the same scores.
gb = GradientBoostingClassifier(random_state=1)
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [38]:
# Accuracy of the gradient-boosting pipeline on the selected features,
# estimated with one repetition of seeded, stratified 5-fold cross-validation.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# `np` is not imported by the notebook's import cell, so `np.mean(scores)`
# raises NameError on a fresh kernel. `scores` is already a NumPy array, so its
# own `.mean()` gives the same value with no extra import.
score = scores.mean()
print(score)

[0.60126403 0.59890576 0.60168852 0.60267899 0.60096222]
0.6010999053128834
