In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_row", 160)
pd.set_option('max_colwidth', 200)

In [3]:
data = pd.read_csv('../../Data/20x43_D860.csv')

In [4]:
data.head(5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,acc_now_delinq,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,pub_rec,months_cr_line,term,initial_list_status,sub_grade,emp_length,ANY,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,0.126095,-0.171185,1.284007,-0.832409,-0.832409,-0.628702,0.113224,-0.289595,-1.128516,-0.366924,-0.906273,-0.442383,-0.031863,-0.15631,-0.039654,-0.794874,1.082347,-1.814253,-0.063823,0.881236,-1.281964,0.948529,-0.031863,-0.948582,-0.363362,1.202768,-0.16457,-0.309121,-0.805499,-0.284851,-0.12429,-0.188982,-0.218681,4.927682,-0.545315,-0.055244,-0.132453,-0.177146,-0.055244,-0.69959,-0.761661,1.512173,0.0
1,-1.706372,-0.161231,-0.572577,-0.554658,-0.554658,-0.628702,-1.710281,-0.817722,-0.306473,-0.324679,-0.906273,-0.442383,-0.031863,-0.15631,-0.106365,-0.859867,-0.323706,-0.261957,-0.063823,0.881236,-1.281964,-0.107952,-0.031863,-0.948582,-0.363362,1.202768,-0.16457,-0.309121,-0.805499,-0.284851,-0.12429,-0.188982,-0.218681,-0.202935,1.833803,-0.055244,-0.132453,-0.177146,-0.055244,1.429409,-0.761661,-0.6613,0.0
2,0.237154,0.037836,-0.953746,-0.832409,-0.832409,1.340048,0.279158,-0.113553,-0.224269,-0.575234,-0.025894,-0.442383,-0.031863,-0.15631,-0.106365,-0.857026,3.894453,0.382842,-0.063823,0.881236,0.780053,-1.692673,-0.031863,-0.948582,-0.363362,1.202768,-0.16457,-0.309121,-0.805499,3.510602,-0.12429,-0.188982,-0.218681,-0.202935,-0.545315,-0.055244,-0.132453,-0.177146,-0.055244,-0.69959,1.312919,-0.6613,1.0
3,0.806329,-0.151278,-0.184372,0.556348,0.556348,1.340048,0.856971,-0.113553,-0.964107,-0.641515,-0.906273,-0.442383,-0.031863,-0.15631,-0.106365,-0.704978,-0.323706,-0.429128,-0.063823,0.881236,0.780053,-0.900312,-0.031863,-0.948582,-0.363362,1.202768,-0.16457,-0.309121,-0.805499,-0.284851,-0.12429,-0.188982,-0.218681,-0.202935,-0.545315,-0.055244,-0.132453,5.645057,-0.055244,1.429409,-0.761661,-0.6613,1.0
4,1.070094,-0.186115,0.938023,0.556348,0.556348,1.340048,1.124656,-0.113553,-0.470882,0.020928,-0.906273,-0.442383,-0.031863,-0.15631,-0.106365,0.498734,-0.323706,-0.644061,-0.063823,0.881236,0.780053,-1.428552,-0.031863,1.054205,-0.363362,-0.831416,6.076436,-0.309121,-0.805499,-0.284851,-0.12429,-0.188982,-0.218681,-0.202935,-0.545315,-0.055244,-0.132453,-0.177146,-0.055244,-0.69959,-0.761661,1.512173,0.0


In [5]:
data.shape

(986, 43)

In [6]:
data.loan_status_bin.value_counts()

1.0    828
0.0    158
Name: loan_status_bin, dtype: int64

In [7]:
#Null Model (or random investment) will yield, 59.3% of picking a good loan: 
data.loan_status_bin.value_counts()[1] / len(data)

0.8397565922920892

In [8]:
X = data.drop('loan_status_bin', axis=1)
X.shape

(986, 42)

In [9]:
y = data['loan_status_bin']

In [11]:
n_features = 5
lr = LogisticRegression()
sfs = SequentialFeatureSelector(lr, n_features_to_select=n_features)

In [12]:
sfs.fit(X,y)

In [13]:
#sfs.get_support()

In [14]:
print("Top {} features selected by forward sequential selection:{}"\
      .format(n_features, list(X.columns[sfs.get_support()])))


Top 5 features selected by forward sequential selection:['loan_amnt', 'dti', 'fico_range_low', 'emp_length', 'small_business']


In [15]:
feat = list(X.columns[sfs.get_support()])
feat

['loan_amnt', 'dti', 'fico_range_low', 'emp_length', 'small_business']

In [16]:
X_feat = data[feat]

In [17]:
y = data['loan_status_bin']
y

0      0.0
1      0.0
2      1.0
3      1.0
4      0.0
      ... 
981    0.0
982    0.0
983    1.0
984    1.0
985    0.0
Name: loan_status_bin, Length: 986, dtype: float64

In [18]:
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps=steps)

In [19]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.83838384 0.84263959 0.84263959 0.84263959 0.84263959]
0.8417884428036713


In [20]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [21]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.83333333 0.82741117 0.84263959 0.82233503 0.83756345]
0.8326565143824027


In [22]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(alpha=1, max_iter=1000)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [23]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.83838384 0.83756345 0.83756345 0.84263959 0.84263959]
0.8397579859508794


In [24]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [25]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.84848485 0.84263959 0.82233503 0.82741117 0.82233503]
0.8326411321335179
