In [1]:
import numpy as np
import pandas as pd

from imblearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Notebook display limits: up to 200 columns, 160 rows and 200 characters per
# cell. Full option names are spelled out because abbreviated keys are
# resolved by pattern matching and stop working if the pattern ever matches
# more than one option.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_colwidth", 200)

In [3]:
# Load the modelling table from its CSV file (relative path).
csv_path = '../../Data/20x43_D106015.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows (five is head()'s default).
data.head()

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,revol_util,inq_last_6mths,delinq_2yrs,mths_since_last_delinq,acc_now_delinq,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,pub_rec,months_cr_line,term,initial_list_status,sub_grade,emp_length,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,Not Verified,Source Verified,Verified,loan_status_bin
0,-1.710264,-0.170493,0.829172,-0.425785,-0.425785,-0.64379,-1.712281,-0.534592,0.045671,0.052599,0.324558,-0.938786,-0.475411,0.653234,-0.074813,-0.169192,-0.105946,-0.604541,-0.392733,-0.077399,-0.074813,0.902671,-1.42411,0.045419,-0.752901,-0.392652,0.990766,-0.169192,-0.312665,-0.885871,-0.271487,-0.114708,-0.215875,-0.185873,-0.19115,1.963633,-0.043113,-0.144338,-0.144338,1.296762,-0.758962,-0.595207,0.0
1,-0.22064,-0.022381,-0.004694,-0.713738,-0.713738,1.352522,-0.187752,0.281683,0.133276,-0.375976,-0.664238,-0.094035,-0.475411,1.318457,-0.074813,-0.169192,-0.105946,-0.601645,3.601285,0.605054,-0.074813,0.902671,0.702193,-1.568703,-0.752901,-0.392652,0.990766,-0.169192,-0.312665,-0.885871,3.683418,-0.114708,-0.215875,-0.185873,-0.19115,-0.50926,-0.043113,-0.144338,-0.144338,-0.771151,1.317589,-0.595207,1.0
2,-1.071854,0.240519,0.629044,-0.425785,-0.425785,1.352522,-1.051675,-0.738661,-1.00558,1.311539,1.802279,-0.938786,-0.475411,2.601388,-0.074813,-0.169192,-0.105946,-0.395169,-0.392733,2.930452,-0.074813,0.902671,0.702193,-0.223601,-0.752901,2.546784,-1.00932,-0.169192,-0.312665,-0.885871,-0.271487,-0.114708,-0.215875,-0.185873,-0.19115,1.963633,-0.043113,-0.144338,-0.144338,-0.771151,-0.758962,1.680088,0.0
3,-0.22064,0.014648,0.680359,0.15012,0.15012,1.352522,-0.187752,1.506095,0.746505,2.320996,0.561723,-0.938786,-0.475411,-0.914792,-0.074813,-0.169192,-0.105946,1.111594,-0.392733,-0.671386,-0.074813,-1.107823,0.702193,-0.223601,1.328195,-0.392652,-1.00932,-0.169192,-0.312665,-0.885871,-0.271487,-0.114708,-0.215875,5.380004,-0.19115,-0.50926,-0.043113,-0.144338,-0.144338,-0.771151,1.317589,-0.595207,0.0
4,-0.305762,-0.194784,1.090877,-0.425785,-0.425785,1.352522,-0.274111,1.506095,1.972965,-0.350852,-0.6314,0.750715,-0.475411,2.126228,-0.074813,-0.169192,0.113927,0.11763,-0.392733,-0.077399,-0.074813,0.902671,0.702193,-1.568703,-0.752901,2.546784,-1.00932,-0.169192,-0.312665,1.128832,-0.271487,-0.114708,-0.215875,-0.185873,-0.19115,-0.50926,-0.043113,-0.144338,-0.144338,-0.771151,-0.758962,1.680088,1.0


In [5]:
# (rows, columns) of the loaded table.
data.shape

(539, 43)

In [6]:
# Class balance of the target column.
data['loan_status_bin'].value_counts()

1.0    461
0.0     78
Name: loan_status_bin, dtype: int64

In [7]:
# Null-model baseline: the share of rows labelled 1 (the "good loan" class),
# i.e. the hit rate of a random investment and the accuracy of always
# predicting 1. The rate is computed rather than quoted in this comment, so
# the text cannot drift away from the value displayed below.
(data.loan_status_bin == 1).mean()

0.8552875695732839

In [8]:
# Feature matrix: every column except the target. The CSV also carries a
# saved row index as an 'Unnamed: <n>' column (visible in the preview of
# `data`); it identifies rows rather than describing a loan, so it is removed
# too, keeping it out of feature selection and model fitting.
index_cols = [col for col in data.columns if str(col).startswith('Unnamed:')]
X = data.drop(columns=['loan_status_bin', *index_cols])
X.shape

(539, 42)

In [9]:
# Target vector: the loan_status_bin column.
y = data.loan_status_bin

In [10]:
# Forward sequential selection: add one column at a time, scoring candidates
# with a logistic regression, until n_features columns have been chosen.
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=lr,
    n_features_to_select=n_features,
    direction='forward',
)

In [11]:
# Run the feature selection on the full dataset.
# NOTE(review): the selector sees every row, including rows that later serve
# as validation folds in the cross_val_score cells, so those scores may be
# optimistic — consider fitting the selector inside the pipeline instead.
sfs.fit(X,y)

In [12]:
# Debug aid: uncomment to inspect the boolean mask of selected columns.
#sfs.get_support()

In [13]:
# Report which columns the selector kept.
print(
    "Top {} features selected by forward sequential selection:{}".format(
        n_features, X.columns[sfs.get_support()].tolist()
    )
)


Top 3 features selected by forward sequential selection:['loan_amnt', 'annual_inc', 'dti']


In [14]:
# Names of the selected columns, kept for building the reduced feature matrix.
feat = X.columns[sfs.get_support()].tolist()
feat

['loan_amnt', 'annual_inc', 'dti']

In [15]:
# Reduced feature matrix: all rows, selected columns only.
X_feat = data.loc[:, feat]

In [16]:
# Target vector (the loan_status_bin column), displayed for a visual check.
y = data.loan_status_bin
y

0      0.0
1      1.0
2      0.0
3      0.0
4      1.0
      ... 
534    1.0
535    1.0
536    1.0
537    1.0
538    0.0
Name: loan_status_bin, Length: 539, dtype: float64

In [17]:
# Logistic-regression baseline wrapped in a single-step pipeline, so it is
# handed to cross_val_score the same way as the other candidate models.
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps)

In [18]:
# Accuracy: 5-fold stratified cross-validation of the logistic-regression
# pipeline on the selected features.
# NOTE(review): compare with the null-model rate computed earlier — with this
# class imbalance, accuracy alone may not separate a useful model from one
# that always predicts the majority class.

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array; its own mean() is used so this cell
# does not depend on a numpy import being present.
score = scores.mean()
print(score)

[0.85185185 0.85185185 0.85185185 0.86111111 0.85981308]
0.8552959501557632


In [19]:
# AdaBoost candidate in a single-step pipeline. random_state is fixed so the
# cross-validation scores are repeatable between runs. AdaBoostClassifier is
# imported in the notebook's import cell.
ada = AdaBoostClassifier(random_state=1)
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [20]:
# Accuracy: 5-fold stratified cross-validation of the AdaBoost pipeline on
# the selected features.

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array; its own mean() is used so this cell
# does not depend on a numpy import being present.
score = scores.mean()
print(score)

[0.81481481 0.83333333 0.83333333 0.85185185 0.8317757 ]
0.8330218068535824


In [21]:
# Multi-layer perceptron candidate in a single-step pipeline. Training is
# stochastic (weight initialisation, shuffling), so random_state is fixed to
# make the cross-validation scores repeatable. MLPClassifier is imported in
# the notebook's import cell.
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=1)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [22]:
# Accuracy: 5-fold stratified cross-validation of the MLP pipeline on the
# selected features.
# NOTE(review): compare with the null-model rate computed earlier — a score
# equal to it suggests the model may be predicting only the majority class.

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array; its own mean() is used so this cell
# does not depend on a numpy import being present.
score = scores.mean()
print(score)

[0.85185185 0.85185185 0.85185185 0.86111111 0.85981308]
0.8552959501557632


In [23]:
# Gradient-boosting candidate in a single-step pipeline. random_state is
# fixed so the cross-validation scores are repeatable between runs.
# GradientBoostingClassifier is imported in the notebook's import cell.
gb = GradientBoostingClassifier(random_state=1)
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [24]:
# Accuracy: 5-fold stratified cross-validation of the gradient-boosting
# pipeline on the selected features.

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array; its own mean() is used so this cell
# does not depend on a numpy import being present.
score = scores.mean()
print(score)

[0.80555556 0.82407407 0.84259259 0.85185185 0.79439252]
0.8236933194877121
