In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_row", 160)
pd.set_option('max_colwidth', 200)

In [4]:
data = pd.read_csv('../../Data/20x43_D1060.csv')

In [5]:
data.head(5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,acc_now_delinq,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,pub_rec,months_cr_line,term,initial_list_status,sub_grade,emp_length,ANY,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,-0.27494,-0.184102,1.311385,-0.819734,-0.819734,-0.643016,-0.28462,-0.307975,-1.102652,-0.320978,-0.916391,-0.442188,-0.056501,-0.161202,-0.046841,-0.78292,1.081878,-1.829511,-0.069254,0.876318,-1.305077,0.970118,-0.028217,-0.952594,-0.369497,1.216261,-0.156365,-0.320362,-0.858014,-0.287554,-0.120532,-0.194898,-0.199254,5.189733,-0.513144,-0.048912,-0.133468,-0.164197,-0.056501,-0.680683,-0.770166,1.488232,0.0
1,-1.703308,-0.172927,-0.550169,-0.538202,-0.538202,-0.643016,-1.70467,-0.833958,-0.301199,-0.284038,-0.916391,-0.442188,-0.056501,-0.161202,-0.107754,-0.846388,-0.354152,-0.248663,-0.069254,0.876318,-1.305077,-0.087275,-0.028217,-0.952594,-0.369497,1.216261,-0.156365,-0.320362,-0.858014,-0.287554,-0.120532,-0.194898,-0.199254,-0.192688,1.948771,-0.048912,-0.133468,-0.164197,-0.056501,1.469112,-0.770166,-0.671938,0.0
2,-0.188373,0.05058,-0.932357,-0.819734,-0.819734,1.336744,-0.155399,-0.132647,-0.221053,-0.503128,-0.045057,-0.442188,-0.056501,-0.161202,-0.107754,-0.843614,3.953939,0.407996,-0.069254,0.876318,0.766238,-1.673364,-0.028217,-0.952594,-0.369497,1.216261,-0.156365,-0.320362,-0.858014,3.477607,-0.120532,-0.194898,-0.199254,-0.192688,-0.513144,-0.048912,-0.133468,-0.164197,-0.056501,-0.680683,1.298421,-0.671938,1.0
3,0.255287,-0.161752,-0.160924,0.587924,0.587924,1.336744,0.294572,-0.132647,-0.942361,-0.561085,-0.916391,-0.442188,-0.056501,-0.161202,-0.107754,-0.695133,-0.354152,-0.418909,-0.069254,0.876318,0.766238,-0.880319,-0.028217,-0.952594,-0.369497,1.216261,-0.156365,-0.320362,-0.858014,-0.287554,-0.120532,-0.194898,-0.199254,-0.192688,-0.513144,-0.048912,-0.133468,6.090231,-0.056501,1.469112,-0.770166,-0.671938,1.0
4,0.460885,-0.200865,0.964475,0.587924,0.587924,1.336744,0.50303,-0.132647,-0.461489,0.018166,-0.916391,-0.442188,-0.056501,-0.161202,-0.107754,0.480341,-0.354152,-0.637795,-0.069254,0.876318,0.766238,-1.409016,-0.028217,1.049765,-0.369497,-0.822192,6.395311,-0.320362,-0.858014,-0.287554,-0.120532,-0.194898,-0.199254,-0.192688,-0.513144,-0.048912,-0.133468,-0.164197,-0.056501,-0.680683,-0.770166,1.488232,0.0


In [6]:
data.shape

(1257, 43)

In [7]:
data.loan_status_bin.value_counts()

1.0    1032
0.0     225
Name: loan_status_bin, dtype: int64

In [8]:
#Null Model (or random investment) will yield, 59.3% of picking a good loan: 
data.loan_status_bin.value_counts()[1] / len(data)

0.8210023866348448

In [9]:
X = data.drop('loan_status_bin', axis=1)
X.shape

(1257, 42)

In [10]:
y = data['loan_status_bin']

In [15]:
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(lr, n_features_to_select=n_features)

In [16]:
sfs.fit(X,y)

In [17]:
#sfs.get_support()

In [18]:
print("Top {} features selected by forward sequential selection:{}"\
      .format(n_features, list(X.columns[sfs.get_support()])))


Top 3 features selected by forward sequential selection:['loan_amnt', 'annual_inc', 'dti']


In [19]:
feat = list(X.columns[sfs.get_support()])
feat

['loan_amnt', 'annual_inc', 'dti']

In [20]:
X_feat = data[feat]

In [21]:
y = data['loan_status_bin']
y

0       0.0
1       0.0
2       1.0
3       1.0
4       0.0
       ... 
1252    0.0
1253    0.0
1254    1.0
1255    1.0
1256    0.0
Name: loan_status_bin, Length: 1257, dtype: float64

In [22]:
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps=steps)

In [23]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.82142857 0.82142857 0.82071713 0.82071713 0.82071713]
0.8210017074558907


In [24]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [25]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.82539683 0.81746032 0.81673307 0.79681275 0.80876494]
0.8130335799658509


In [26]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(alpha=1, max_iter=1000)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [27]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.82142857 0.82142857 0.82071713 0.82071713 0.82071713]
0.8210017074558907


In [28]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [29]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.82142857 0.80952381 0.80079681 0.78884462 0.812749  ]
0.8066685638398786
